[{"original_comment": "# So we can look at the progress on Tensorboard\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n#%%\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport time\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n#%%\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n#%%\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n#%%\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n 
documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n#%%\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))", "target_code": "import time\n\ncallback = keras.callbacks.TensorBoard(\n log_dir='./logs/inception/2/{}'.format(time.time())\n)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. 
freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# Enable callback to be able to look at the progress on Tensorboard"}, {"original_comment": " # empty arrays to be filled\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. 
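# --- Editorial sketch (not part of the dataset record above) ---------------------
# The first record's intent is "Enable callback to be able to look at the progress
# on Tensorboard" and its target_code only creates the TensorBoard callback. As a
# minimal, assumed sketch of how that callback and the thread-safe BSON generator
# from the record *might* be wired together for training: `train_documents`,
# `steps_per_epoch`, `epochs` and `workers` below are illustrative placeholders,
# and `inception` / `get_features_label` come from the record's own code.
import time
import bson
import keras

callback = keras.callbacks.TensorBoard(
    log_dir='./logs/inception/2/{}'.format(time.time())
)

# re-open the training BSON so the generator can stream documents
train_documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))

inception.fit_generator(
    get_features_label(train_documents, batch_size=32),
    steps_per_epoch=1000,   # assumed value; depends on dataset size / batch size
    epochs=1,               # assumed
    workers=4,              # the generator is wrapped thread-safe, so parallel workers are fine
    callbacks=[callback],
)
# ---------------------------------------------------------------------------------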
Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n#%%\n\n# comments beginning with #BEE were written by bee martin\n\n#%%\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n#%%\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n#%%\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n#%%\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n#%%\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n#%%\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n#%%\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n 
(len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n#%%\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n#%%\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n#%%\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n#%%\n\nastrometric_error = [0.035, 0.025] # [u-band error, g-band error]\n\nairmasses = uniform(low=1.0, high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n#%%\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n#%%\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n#%%\n\n# GTR: Extract the airmass and filters for each observation\n\n#%%\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. 
The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n#%%\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n#%%\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n#%%\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n#%%\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n#%%\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n#%%\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n#%%\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n#%%\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n#%%\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n#%%\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n#%%\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the 
best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. tanZ plane for a single object\n\n#%%\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n#%%\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. 
tanZ plane for all the objects, calling run_fit for each\n\n#%%\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = 
\"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n#%%\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n#%%\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n#%%\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n#%%\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n#%%\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]", "target_code": " feature_distance = np.zeros((num_of_quasars, num_features, len(zshifts)))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n\n# comments beginning with #BEE were written by bee martin\n\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n\nastrometric_error = [0.035, 0.025] # [u-band error, g-band error]\n\nairmasses = uniform(low=1.0, 
high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n\n# GTR: Extract the airmass and filters for each observation\n\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = 
m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]\n", "project_metadata": {"full_name": "RichardsGroup/LSSTprep", "description": "Repository for Richards group LSST prep work, specifically related to the AGN SC", "topics": [], "git_url": "git://github.com/RichardsGroup/LSSTprep.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2018-06-20T20:43:08Z", "size": 30265, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 8424521, "Python": 6419}, "last_updated": "2020-09-28T18:32:02Z"}, "intent": " # empty arrays to be filled"}, {"original_comment": "# Show the top 10 rows of the sample\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n#%%\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n#%%\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n#%%\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n#%%\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n#%%\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n#%%", "target_code": "sample_df.head()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: 
{}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "intent": "# Show the top 10 rows of the sample"}, {"original_comment": "# Let's check the overall accuracy.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n#%%\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n#%%\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n#%%\n\nleads_scoring.head(5)\n\n#%%\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n#%%\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n#%%\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n#%%\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n#%%\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n#%%\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n#%%\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n#%%\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing 
Missing values in Lead Quality\n\n#%%\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n#%%\n\nround(\n 100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n#%%\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n#%%\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n#%%\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n#%%\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n#%%\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n#%%\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n#%%\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n#%%\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n#%%\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n#%%\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n#%%\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n#%%\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n#%%\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n#%%\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n#%%\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n#%%\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n#%%\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n#%%\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n#%%\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 
100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n#%%\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n#%%\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n#%%\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n#%%\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n#%%\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n#%%\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n#%%\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n#%%\n\nleads_scoring_model.info()\n\n#%%\n\nleads_scoring_model.shape\n\n#%%\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n#%%\n\n\n\n#%%\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n#%%\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n#%%\n\n\n\n#%%\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n#%%\n\nX_train.describe()\n\n#%%\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n#%%\n\n\n\n#%%\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n#%%\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n#%%\n\nlogreg = LogisticRegression()\n\n#%%\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n#%%\n\n\n\n#%%\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n#%%\n\ncols = X_train.columns[rfe.support_]\n\n#%%\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n#%%\n\ncols\n\n#%%\n\nX_train.shape\n\n#%%\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n#%%\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data 
set\n\n#%%\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n#%%\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n#%%\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n#%%\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n#%%\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n#%%\n\n\n\n#%%\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n#%%", "target_code": "print(metrics.accuracy_score(y_train_pred_final.Converted,\n y_train_pred_final.Predicted_Conversion))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n\nleads_scoring.head(5)\n\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay 
the amount through cheque'], axis=1)\n\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n\nround(\n 100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 
'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n\nround(leads_scoring.describe(), 2)\n\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n\nleads_scoring_model.info()\n\n\nleads_scoring_model.shape\n\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n\n\n\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n\n\n\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n\nX_train.describe()\n\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n\n\n\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n\nlogreg = LogisticRegression()\n\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n\n\n\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n\ncols = X_train.columns[rfe.support_]\n\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n\ncols\n\n\nX_train.shape\n\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data set\n\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a 
dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n\n\n\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n", "project_metadata": {"full_name": "saad1504/Upgrad_DataScience_Projects", "description": "All Data Science projects completed for PGPDS by Upgrad", "topics": [], "git_url": "git://github.com/saad1504/Upgrad_DataScience_Projects.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2019-10-14T16:57:22Z", "size": 29931, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6008971, "PLSQL": 11605}, "last_updated": "2020-10-12T22:18:23Z"}, "intent": "# Let's check the overall accuracy."}, {"original_comment": " # pad zero at the end\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n#%%\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n 
torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n#%%\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n#%%\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n#%%\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):", "target_code": " res_1 = np.concatenate([res_1, np.zeros(len(mix)-len(res_1))])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n\n# global params\n\nparser = 
argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith 
torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):\n", "project_metadata": {"full_name": "naplab/DANet", "description": "Deep Attractor Network (DANet) for single-channel speech separation", "topics": [], "git_url": "git://github.com/naplab/DANet.git", "stars": 53, "watchers": 53, "forks": 15, "created": "2018-09-18T21:26:22Z", "size": 11, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23340, "Python": 4814}, "last_updated": "2020-12-14T07:40:33Z"}, "intent": " # pad zero at the end"}, {"original_comment": "# get list of r-band images\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n#%%\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n#%%\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n#%%\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n#%%", "target_code": "imgList_r = list(glob('Inputs/photo_r_CNN*.fits'))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n", "project_metadata": {"full_name": "cbottrell/RealSim", "description": "RealSim is the statistical observational realism suite described in Bottrell et al 2017ab and made public in Bottrell et al 2019b.", "topics": [], "git_url": "git://github.com/cbottrell/RealSim.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2019-07-10T21:26:45Z", "size": 20047, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2416365, "C": 294600, "Python": 34394, "Makefile": 4159, "Tcl": 1042, "Shell": 374, "C++": 88}, "last_updated": "2020-05-29T13:33:55Z"}, "intent": "# get list of r-band images"}, {"original_comment": "# ### Use a different marker for the hue levels:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n#%%\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n#%%\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n#%%\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n#%%\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n#%%\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n#%%\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n#%%\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n#%%\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n#%%\n\nsns.violinplot(x=\"day\", 
y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n#%%\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n#%%\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n#%%\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()", "target_code": "palette = dict(Lunch=\"blue\", Dinner=\"red\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\", palette=palette,\n hue_order=[\"Dinner\", \"Lunch\"],\n hue_kws=dict(marker=[\"^\", \"v\"]))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, 
aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()\n\n\n\n", "project_metadata": {"full_name": "scidatmath2020/Ciencia-de-datos-con-Python", "description": null, "topics": [], "git_url": "git://github.com/scidatmath2020/Ciencia-de-datos-con-Python.git", "stars": 20, "watchers": 20, "forks": 27, "created": "2020-09-07T20:49:59Z", "size": 20544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5705341, "Python": 12821}, "last_updated": "2020-11-19T22:06:09Z"}, "intent": "# Use a different marker for the hue levels"}, {"original_comment": "# ### Reshape Feature\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n#%%\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n#%%\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n#%%\n\n# shape\ndf.shape\n\n#%%\n\ndf\n\n#%%\n\n# top 5 rows in dataframe\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.describe()\n\n#%%\n\n# statistical details T is transpost\ndf.describe().T\n\n#%%\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n#%%\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n#%%\n\nX\n\n#%%\n\ny\n\n\n# ## split this data into training and test sets\n\n#%%\n\n\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n#%%\n\n\n\n#%%\n\nmodel = LinearRegression()\n\n#%%\n\nmodel\n\n\n# ## Train model\n\n#%%\n\nmodel.fit()\n\n#%%\n\nmodel.fit(X_train, y_train)", "target_code": "X = X.reshape(-1, 1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n\n# shape\ndf.shape\n\n\ndf\n\n\n# top 5 rows in dataframe\ndf.head()\n\n\ndf.info()\n\n\ndf.describe()\n\n\n# statistical details T is transpost\ndf.describe().T\n\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. 
Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n\nX\n\n\ny\n\n\n# ## split this data into training and test sets\n\n\n\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n\n\n\n\nmodel = LinearRegression()\n\n\nmodel\n\n\n# ## Train model\n\n\nmodel.fit()\n\n\nmodel.fit(X_train, y_train)\n\n\n\n", "project_metadata": {"full_name": "Jetsukda/ML-KBTGxMeowCode", "description": "Say \"Hello\" Machine Learning by KBTGxMeowCode", "topics": [], "git_url": "git://github.com/Jetsukda/ML-KBTGxMeowCode.git", "stars": 3, "watchers": 3, "forks": 34, "created": "2020-06-28T07:57:09Z", "size": 5316, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11012140}, "last_updated": "2020-09-01T17:59:00Z"}, "intent": "# Reshape Feature"}, {"original_comment": "# To compute the Frobenius norm:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n#%%\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n#%%\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n#%%\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. 
If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n#%%\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n#%%\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#", "target_code": "A_frobenius = np.linalg.norm(A, 'fro')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. 
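# As a quick numerical sanity check of this inequality (an illustrative sketch added here, not part of the original notebook: `A_check` and the test vectors below are new example arrays), we can confirm that ||Ax||_2 never exceeds ||A||_2 ||x||_2 and that the induced 2-norm coincides with the largest singular value:

import numpy as np

rng = np.random.default_rng(2)
A_check = rng.standard_normal((4, 4))
A_norm2 = np.linalg.norm(A_check, 2)  # induced 2-norm of the matrix

for _ in range(5):
    x_check = rng.standard_normal(4)
    # the inequality ||A x|| <= ||A|| ||x||, with a small tolerance for round-off
    assert np.linalg.norm(A_check @ x_check) <= A_norm2 * np.linalg.norm(x_check) + 1e-12

# the induced 2-norm equals the largest singular value of A_check
print(np.isclose(A_norm2, np.linalg.svd(A_check, compute_uv=False)[0]))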
The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#\n\n\n\n", "project_metadata": {"full_name": "garth-wells/notebooks-3M1", "description": "Jupyter notebooks (Python) for the course 3M1 at the Department of Engineering, University of Cambridge", "topics": ["linear-algebra", "singular-value-decomposition", "regression"], "git_url": "git://github.com/garth-wells/notebooks-3M1.git", "stars": 10, "watchers": 10, "forks": 18, "created": "2015-01-12T22:32:25Z", "size": 128315, "license": "bsd-2-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7472485}, "last_updated": "2021-01-04T10:34:46Z"}, "intent": "# To compute the Frobenius norm:"}, {"original_comment": "# Histograms of test input data by column\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. 
Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n#%%\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n#%%\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
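# As a rough illustration of the two fits above, they can be written directly as NumPy expressions (a sketch only: the coefficients and exponents are copied from the equations as quoted here, the unit conventions of the original KK2000 fit are assumed, and the sample values are arbitrary):

#%%

import numpy as np


def kk2000_autoconversion(qc, nc):
    # (dq_r/dt)_AUTO = 13.5 * q_c^2.47 * N_c^-1.1, as given above
    return 13.5 * qc ** 2.47 * nc ** -1.1


def kk2000_accretion(qc, qr):
    # (dq_r/dt)_ACCRE = 67 * (q_c * q_r)^1.15, as given above
    return 67.0 * (qc * qr) ** 1.15


# arbitrary sample values, purely for illustration
qc_s, qr_s, nc_s = 1e-4, 1e-5, 1.0e8
print(kk2000_autoconversion(qc_s, nc_s), kk2000_accretion(qc_s, qr_s))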
For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
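# A schematic version of the bin splitting and tendency calculation described above (purely illustrative: the bin arrays, the timestep `dt`, and the way the minimum is located are simplified assumptions, not the actual TAU code):

#%%

import numpy as np


def bulk_tendencies_from_bins(mass_init, num_init, mass_final, num_final, dt):
    """Split a binned drop distribution into cloud and rain at the minimum of the
    final number distribution, then form bulk tendencies as (final - initial) / dt."""
    split = int(np.argmin(num_final))  # stand-in for the 40-100 micron minimum

    def bulk(mass, num):
        return mass[:split].sum(), mass[split:].sum(), num[:split].sum(), num[split:].sum()

    qc0, qr0, nc0, nr0 = bulk(mass_init, num_init)
    qc1, qr1, nc1, nr1 = bulk(mass_final, num_final)
    return {"qrtend": (qr1 - qr0) / dt,
            "nrtend": (nr1 - nr0) / dt,
            "nctend": (nc1 - nc0) / dt}


# toy usage with random 32-bin distributions and a 300 s timestep (illustrative)
rng = np.random.default_rng(0)
m0, n0, m1, n1 = (rng.random(32) for _ in range(4))
print(bulk_tendencies_from_bins(m0, n0, m1, n1, dt=300.0))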
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
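# The meta table above lists `temperature` as derived from pressure and density. For reference, a minimal sketch of that relation assuming the dry-air ideal gas law with R_d = 287 J kg^-1 K^-1 (an assumption -- the exact formula used to build the dataset is not stated here):

#%%

R_D = 287.0  # dry-air gas constant, J kg^-1 K^-1 (assumed)


def temperature_from_pressure_density(pressure_pa, rho):
    # ideal gas law: p = rho * R_d * T  =>  T = p / (rho * R_d)
    return pressure_pa / (rho * R_D)


print(temperature_from_pressure_density(85000.0, 1.0))  # roughly 296 K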
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n#%%\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n#%%\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n#%%\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n#%%\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n#%%\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n 
ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n#%%\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n#%%\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML 
model\n# * Hyperparameter choices and justification\n#\n# A baseline model for solving this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each and relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (labels 1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorical crossentropy loss function, while the regression models are trained using the mean squared error loss function.\n#\n#
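# To make the architecture above concrete, here is a minimal Keras sketch of one classifier head and one regressor head (illustrative only: the layer sizes and losses follow the description above, while the input dimension, optimizer settings, and three-class output are assumptions; the notebook itself builds these networks with mlmicrophysics.models.DenseNeuralNetwork below):

#%%

from tensorflow import keras
from tensorflow.keras import layers


def make_classifier(n_inputs=5, n_classes=3, hidden_layers=4, hidden_neurons=30):
    # 4 hidden relu layers of 30 neurons, softmax output, categorical crossentropy
    inputs = keras.Input(shape=(n_inputs,))
    x = inputs
    for _ in range(hidden_layers):
        x = layers.Dense(hidden_neurons, activation="relu")(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(1e-4), loss="categorical_crossentropy")
    return model


def make_regressor(n_inputs=5, hidden_layers=4, hidden_neurons=30):
    # 4 hidden relu layers of 30 neurons, single linear output, mean squared error
    inputs = keras.Input(shape=(n_inputs,))
    x = inputs
    for _ in range(hidden_layers):
        x = layers.Dense(hidden_neurons, activation="relu")(x)
    outputs = layers.Dense(1, activation="linear")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(1e-4), loss="mse")
    return model


make_classifier().summary()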
\n#\n\n#%%\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n#%%\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n#%%\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n#%%\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n#%%\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n#%%\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed 
{output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n#%%\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n#%%\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n#%%\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n#%%\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n#%%\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n#%%\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n#%%", "target_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_test[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. 
Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
\n\n# For CAM6 the autoconversion rate exponent and prefactor have been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987, 1989) and Feingold et al. (1988), with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al. (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model documentation is available here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with the number concentration in each bin. We then find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimum is used to divide the bins into liquid and rain. The total number and mass in each category are then defined, and tendencies are calculated as the final mass and number minus the initial mass and number, divided by the timestep. A limiter is applied to ensure that the mass and number remain non-zero, and the tendencies are limited accordingly.
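\n#\n# A schematic sketch of this splitting and differencing step is shown below. It is illustrative only: the array names are assumed for this example and this is not the TAU code. Here num0/mass0 and num1/mass1 hold the binned number and mass before and after stochastic collection, diam holds the bin diameters in meters, and dt is the timestep in seconds.\n\nimport numpy as np\n\n\ndef bin_collection_tendencies(mass0, num0, mass1, num1, diam, dt):\n    '''Split the bins into cloud and rain at the minimum of the final number\n    distribution (searched between 40 and 100 microns diameter) and convert the\n    bulk differences over the timestep into process-rate tendencies.'''\n    # ignore bins outside the 40-100 micron search window when locating the minimum\n    search = np.where((diam >= 40e-6) & (diam <= 100e-6), num1, np.inf)\n    split = int(np.argmin(search))\n    cloud, rain = slice(0, split), slice(split, None)\n    qctend = (mass1[cloud].sum() - mass0[cloud].sum()) / dt\n    qrtend = (mass1[rain].sum() - mass0[rain].sum()) / dt\n    nctend = (num1[cloud].sum() - num0[cloud].sum()) / dt\n    nrtend = (num1[rain].sum() - num0[rain].sum()) / dt\n    return qctend, qrtend, nctend, nrtend\n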
\n\n# This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code also runs the accretion and autoconversion from MG2 on the same state, and we save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics dataset contains 176 files with the variables summarized in the tables below.\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datasets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n 
ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML model\n# * Hyperparameter choices and 
justification\n#\n# The baseline model for this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. There are three classifier networks (one per output variable) that feed into four regressor networks (one per nonzero output class). Each classifier and regressor network has 4 hidden layers of 30 neurons each with ReLU activation. Those hidden layers feed into a final output layer of size 2 or 3 for classification (labels 1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained with the categorical cross-entropy loss function, while the regression models are trained with the mean squared error loss function.\n#\n#
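\n\n# A minimal sketch (with assumed interfaces, not the training loop that follows) of how the in-series networks are combined at prediction time for a single output variable: the classifier assigns a regime label to every sample, and only samples with a nonzero label are passed through the regressor trained for that label and then inverse-transformed back to physical units. The sign handling below follows the nrtend_TAU-style encoding in the output_transforms dictionary defined earlier.\n\nimport numpy as np\n\n\ndef predict_tendency(x_scaled, classifier, regressors, scalers):\n    '''x_scaled: scaled input array; regressors and scalers: dicts keyed by nonzero class label.'''\n    labels = classifier.predict(x_scaled)\n    tendency = np.zeros(x_scaled.shape[0])\n    for label, regressor in regressors.items():\n        mask = labels == label\n        if mask.any():\n            scaled_pred = regressor.predict(x_scaled[mask]).reshape(-1, 1)\n            log10_value = scalers[label].inverse_transform(scaled_pred).ravel()\n            # a zero label means no tendency; the +/-1 labels carry the sign of the tendency\n            tendency[mask] = np.sign(label) * 10.0 ** log10_value\n    return tendency\n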
\n#\n\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} 
output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness. Science, 245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A., Field, P. R., & Elsaesser, G. S. (2019). Strong Dependence of Atmospheric Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in HadGEM3. Journal of Advances in Modeling Earth Systems, 11(6), 1735\u20131758. doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Schanen, D. P. (2013). Higher-order turbulence closure and its impact on Climate Simulation in the Community Atmosphere Model. Journal of Climate, 26(23), 9655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K., Edwards, J., ... Strand, W. G. (2020). The Community Earth System Model Version 2 (CESM2). Journal of Advances in Modeling Earth Systems, 12(2), e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M. (2014, September). On the Representation of High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model. Monthly Weather Review, 142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A. (2015, November). Putting the clouds back in aerosol\u2013cloud interactions. Atmos. Chem. Phys., 15(21), 12397\u201312411. doi: 10.5194/acp-15-12397-2015\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulating Observations of Southern Ocean Clouds and Implications for Climate. J. Adv. Model. Earth Syst. doi: 10.1029/2020JD032619\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G., Danabasoglu, G., ... Mills, M. J. (2019). High Climate Sensitivity in the Community Earth System Model Version 2 (CESM2). Geophysical Research Letters, 46(14), 8329\u20138337. doi: 10.1029/2019GL083978\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes. J. Climate, 28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015). Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global Model Solutions and Aerosol\u2013Cloud Interactions. J. Climate, 28(3), 1288\u20131307. doi: 10.1175/JCLI-D-14-00103.1\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud Feedback. Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R. (2002). A PDF-Based Model for Boundary Layer Clouds. Part II: Model Results. J. Atmos. Sci., 59, 3552\u20133571.\n#\n# Hoose, C., Kristj\u00e1nsson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust, Soot, and Biological Particles in a Global Climate Model. J. Atmos. Sci., 67(8), 2483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an improved longwave radiation model, RRTM, on the energy budget and thermodynamic properties of the NCAR community climate model, CCM3. J. Geophys. Res., 105(D11), 14,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in a large-eddy simulation model of marine stratocumulus. Monthly Weather Review, 128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R. (2002, December). Small-Scale and Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Functions. J. Atmos. Sci., 59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059<3519:SSAMVI>2.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models. Meteor. Z., 14, 499\u2013514.\n#\n# Michibata, T., & Takemura, T. (2015, September). Evaluation of autoconversion schemes in a single model framework with satellite observations. J. Geophys. Res. Atmos., 120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M. (2008). The Impact of Convection on ENSO: From a Delayed Oscillator to a Series of Events. J. Climate, 21, 5904-+. doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997). Microphysics of Clouds and Precipitation (Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulating autoconversion, accretion and selfcollection. Atmos. Res., 59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on cirrus clouds and comparison between different ice nucleation parameterizations with the Community Atmosphere Model (CAM5). Atmospheric Chemistry and Physics, 15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds. J. Atmos. Sci., 34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B. (2014, October). Different contact angle distributions for heterogeneous ice nucleation in the Community Atmospheric Model version 5. Atmos. Chem. Phys., 14(19), 10411\u201310430. doi: 10.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the parameterization of cumulus convection in the Canadian Climate Center general circulation model. Atmos. Ocean, 33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n                                     meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n                                                \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n    \"\"\"\n    Load data from a list of files without applying any scaling or transformation.\n    Args:\n        files: List of files being loaded\n        input_cols: List of input columns for training the neural networks\n        output_cols: List of output columns\n        train: Whether this is the training set (kept for interface compatibility; not used here)\n        subsample: Fraction of rows to randomly sample from each file (1 loads every row)\n        meta_cols: Metadata columns to keep alongside the inputs and outputs\n    Returns:\n        Combined input, output, and metadata DataFrames\n    \"\"\"\n    all_input_data = []\n    all_output_data = []\n    all_meta_data = []\n    for i, filename in enumerate(files):\n        if i % 10 == 0:\n            print(\n                f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "intent": "# Histograms of test input data by column"}, {"original_comment": "# ##### Linear Support Vector Regressor from sklearn.svm\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n#%%\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 
'inline')\nwarnings.filterwarnings(\"ignore\")\n\n#%%\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n#%%\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n#%%\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n#%%\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n#%%\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. 
(fips, regionidcity, regionidzip, regionidcounty)\n\n#%%\n\ndf['age'] = 2017 - df.yearbuilt\n\n#%%\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n#%%\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. I'll test it here, but will need to implement above before we do all the prepping.\n\n#%%\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n#%%\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n#%%\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n#%%\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n#%%\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n#%%\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n#%%\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n#%%\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = 
train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n#%%\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n#%%\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n#%%\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n#%%\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n#%%\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n#%%\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n#%%\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n#%%\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n#%%\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n#%%\n\ntrain.cluster_loc.value_counts()\n\n#%%\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n#%%\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n#%%\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n#%%\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n#%%\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n#%%\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n#%%\n\ntest.cluster_home.value_counts()\n\n#%%\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n#%%\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\nless_significant_clusters\n\n#%%\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n#%%\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n#%%\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n#%%\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n#%%\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data 
Validation**\n\n#%%\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n#%%\n\ntrain.head()\n\n#%%\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n#%%\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n#%%\n\ntrain_no_clusters.head()\n\n#%%\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n#%%\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n#%%\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 
models, 1 for each county\n# Try with clusters and then try with original features\n\n#%%\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n#%%\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n#%%\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n#%%\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n#%%\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n#%%\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n#%%\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = 
LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#", "target_code": "regr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, 
rows with > 99% missing\n\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. 
(fips, regionidcity, regionidzip, regionidcounty)\n\n\ndf['age'] = 2017 - df.yearbuilt\n\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. I'll test it here, but will need to implement above before we do all the prepping.\n\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = 
pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n\ntrain.cluster_loc.value_counts()\n\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n\ntest.cluster_home.value_counts()\n\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\nless_significant_clusters\n\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if 
greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n\ntrain.head()\n\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n\ntrain_no_clusters.head()\n\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 models, 1 for each county\n# Try with clusters and then try with original 
features\n\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = 
lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n\n\n\n", "project_metadata": {"full_name": "CodeupClassroom/bayes-methodologies-exercises", "description": "Bayes exercises on methodologies", "topics": [], "git_url": "git://github.com/CodeupClassroom/bayes-methodologies-exercises.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2019-10-09T14:04:48Z", "size": 13779, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 17490873, "Python": 71621}, "last_updated": "2020-01-06T20:54:05Z"}, "intent": "# Linear Support Vector Regressor from sklearn.svm"}, {"original_comment": "# Download the raw-data files\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n#%%\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. 
If you are sure of it go-forward & run the cells below.\n\n#%%", "target_code": "get_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1gkgUlkaRXUzrNR_jY42ieK4xtLX3ztKX'\")\nget_ipython().system(\n \"unzip 'raw_data_files.zip' -d '/content/drive/My Drive/ICDMAI_Tutorial/'\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. If you are sure of it go-forward & run the cells below.\n\n", "project_metadata": {"full_name": "amitbcp/icdmai_2020", "description": "This repository is for the Session held in International Conference on Data Management, Analytics and Innovation, New Delhi 2020", "topics": ["deeplearning", "recurrent-neural-networks", "rnn-pytorch", "word-embeddings", "text-classification", "rnns", "notebooks", "stackoverflow", "tag-recommender", "recommendation-system", "svm", "onevsrest"], "git_url": "git://github.com/amitbcp/icdmai_2020.git", "stars": 7, "watchers": 7, "forks": 2, "created": "2020-01-04T04:42:01Z", "size": 13078, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2676004}, "last_updated": "2021-01-06T14:44:09Z"}, "intent": "# Download the raw-data files"}, {"original_comment": " # extract dimensions\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n#%%\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n#%%\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n#%%\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)", "target_code": " original_image_height, original_image_width = gray_img.shape\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)\n", "project_metadata": {"full_name": "howarder3/ironman2020_OpenCV_photoshop", "description": null, 
"topics": [], "git_url": "git://github.com/howarder3/ironman2020_OpenCV_photoshop.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-12T15:55:03Z", "size": 125635, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 131231786}, "last_updated": "2020-12-23T03:20:58Z"}, "intent": " # extract dimensions"}, {"original_comment": "# normalize the format of DATE\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n#%%\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')", "target_code": "train_target['Date'] = pd.to_datetime(train_target['Date'])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')\n", "project_metadata": {"full_name": "Quan-Sun/TADPOLE-ECE5970", "description": "machine learning with biomedical data", "topics": [], "git_url": "git://github.com/Quan-Sun/TADPOLE-ECE5970.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-11-16T21:39:24Z", "size": 15650, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5564392}, "last_updated": "2019-04-19T22:32:32Z"}, "intent": "# normalize the format of DATE"}, {"original_comment": "# ### Creating an instance of LogisticRegrssion class\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n#%%\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n#%%\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n#%%\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n#%%\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n#%%\n\nfeatures = iris_df.values\n\n#%%\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n#%%\n\n\n\n#%%\n\n# ### Split the data into training and testing data, with random seeding\n\n#%%\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)", "target_code": "from sklearn.linear_model import LogisticRegression\n\nlogReg = LogisticRegression()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic 
Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n\nfeatures = iris_df.values\n\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n\n\n\n\n# ### Split the data into training and testing data, with random seeding\n\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)\n\n\n\n", "project_metadata": {"full_name": "naveen21553/ml-workshop", "description": "Machine Learning Workshop Resources", "topics": [], "git_url": "git://github.com/naveen21553/ml-workshop.git", "stars": 12, "watchers": 12, "forks": 14, "created": "2018-09-28T15:03:08Z", "size": 5274, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 685393, "Python": 11705}, "last_updated": "2020-10-11T10:46:03Z"}, "intent": "# Creating an instance of LogisticRegrssion class"}, {"original_comment": "# Check if there is any null information anywhere\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. 
Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n#%%\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n#%%\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n#%%\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n#%%\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n#%%", "target_code": "train.isnull().any().any()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. 
Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. 
Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# Check if there is any null information anywhere"}, {"original_comment": "# Plot the counts\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n#%%\n\ndata.head()\n# look at the dataset\n\n#%%\n\ndata.info()\n# basic info of dataset\n\n#%%\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n#%%\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n#%%\n\ndata.describe()\n\n#%%\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n#%%\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n#%%\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n#%%\n\ndata.columns\n\n#%%\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n#%%\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n#%%\n\n# Data Visulaization\n\n#%%\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n#%%\n\nsns.countplot(data['room_type'])\n\n#%%\n\n# We can observe reduced preference in shared rooms\n\n#%%\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n#%%\n\n# heavily skewed\n\n#%%\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n#%%\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n#%%\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude 
that these are misleading\n\n#%%\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n#%%\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n#%%\n\ndata.corr()\n\n#%%\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n#%%\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n#%%\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n#%%\n\n# to plot locaation and price on NY city map\n\n#%%\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n#%%\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n#%%\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n#%%\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n#%%\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)", "target_code": "ax = plt.subplot(1, 1, 1)\nplt.imshow(img, 'hot')\nplt.axis('off')\nplt.tight_layout()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n\ndata.head()\n# look at the dataset\n\n\ndata.info()\n# basic info of dataset\n\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n\ndata.describe()\n\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n\n# We can observe 
that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n\ndata.columns\n\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n\n# Data Visulaization\n\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n\nsns.countplot(data['room_type'])\n\n\n# We can observe reduced preference in shared rooms\n\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n\n# heavily skewed\n\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n\ndata.corr()\n\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n\n# to plot locaation and price on NY city map\n\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)\nplt.figure(figsize=(12, 12), facecolor='black')\n", "project_metadata": {"full_name": "maheshd20/Da_project_sem5", "description": null, "topics": [], "git_url": "git://github.com/maheshd20/Da_project_sem5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-30T13:22:44Z", "size": 5278, "license": 
"", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1470956}, "last_updated": "2020-11-30T15:37:07Z"}, "intent": "# Plot the counts"}, {"original_comment": "# Check the cinemas dataset contains any duplicated address\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n#%%\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n#%%\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n#%%\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n#%%\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n#%%\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n#%%\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n#%%\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n#%%\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n#%%\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n#%%\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n#%%\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n#%%\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n#%%\n\n', '.join([cat for cat in fs_categories])\n\n#%%\n\ncinema = df_cinemas.loc[0]\n\n#%%\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n#%%\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n#%%\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n#%%\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n#%%\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n#%%\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation", "target_code": "duplicated = df_cinemas.duplicated('Address', keep=False)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n\n', '.join([cat for cat in fs_categories])\n\n\ncinema = df_cinemas.loc[0]\n\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation\n\n\n\n", "project_metadata": {"full_name": "meghsat/CourseraIBMdatascience_course", "description": "In this repo consists of the projects I had done as part of the coursera's IBM data science Professional certificate.", "topics": [], "git_url": "git://github.com/meghsat/CourseraIBMdatascience_course.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-04-08T05:37:45Z", "size": 4855, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 14626378}, "last_updated": "2020-05-28T09:51:40Z"}, "intent": "# Check the cinemas dataset contains any duplicated address"}, {"original_comment": "# Merge the obtained results with the pulls DataFrame\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala, which spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.\n#
  2. pull_files.csv contains the files that were modified by each pull request.\n#
\n\n#%%\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.
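A minimal illustrative sketch (the two example date strings below are made up): ISO8601 strings parsed with pd.to_datetime(..., utc=True) become timezone-aware Timestamps that can be compared and sorted directly, which is what the next cell relies on for the pulls data.

import pandas as pd

example_dates = pd.Series(['2016-01-14T16:13:43Z', '2015-06-02T08:00:00-07:00'])
parsed = pd.to_datetime(example_dates, utc=True)
print(parsed.sort_values())             # chronological order, both values expressed in UTC
print(parsed.iloc[0] > parsed.iloc[1])  # tz-aware Timestamps support direct comparison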

\n\n#%%\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The extracted data comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the tasks that follow.

\n\n#%%\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.
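As a hedged sketch of the same idea (assuming the pulls.csv layout used above, and a separate variable name so the notebook's own pulls object is untouched), grouping on a year-month period rather than a month-first string keeps the plotted x-axis in chronological order:

import pandas as pd

pulls_sketch = pd.read_csv('datasets/pulls.csv')
pulls_sketch['date'] = pd.to_datetime(pulls_sketch['date'], utc=True)
monthly_counts = pulls_sketch.groupby(pulls_sketch['date'].dt.to_period('M')).size()
monthly_counts.plot()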

\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier of entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution showing that there are few people who contribute only a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.
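A small sketch of the distribution described here (assuming the same pulls.csv columns as above): count pull requests per contributor with value_counts(), then look at the spread of those counts as a histogram.

import pandas as pd

pulls_sketch = pd.read_csv('datasets/pulls.csv')
pr_per_user = pulls_sketch['user'].value_counts()  # one pull-request count per contributor
pr_per_user.hist(bins=50)                          # most contributors land in the low-count bins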

\n\n#%%\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead. Contributing there might not have the most impact. Therefore, it is important to understand the parts of the system that have been recently changed. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening; focusing on the inactive parts might not be the most effective use of our time.

\n\n#%%\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance. We might find ourselves needing some information regarding the codebase. Contributors to open source projects generally have other day jobs, so their time is limited, and it is important to direct any questions to the right person. One way to identify the right target for our inquiries is by using their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n#%%\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.
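One possible sketch (not the notebook's own solution) that filters pull_files on its own 'file' column so the boolean mask aligns by construction, joins back to pulls for the dates, and keeps the ten most recent pull requests that touched the file:

import pandas as pd

pulls_sketch = pd.read_csv('datasets/pulls.csv')
pulls_sketch['date'] = pd.to_datetime(pulls_sketch['date'], utc=True)
pull_files_sketch = pd.read_csv('datasets/pull_files.csv')

target_file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
touched = pull_files_sketch[pull_files_sketch['file'] == target_file]
recent_prs = touched.merge(pulls_sketch, on='pid').nlargest(10, 'date')
print(recent_prs[['pid', 'user', 'date']])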

\n\n#%%\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]", "target_code": "joined_pr = file_pr.merge(pulls, on='pid')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala, which spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.\n#
  2. pull_files.csv contains the files that were modified by each pull request.\n#
\n\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.

\n\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The extracted data comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the tasks that follow.

\n\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.

\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier of entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution showing that there are few people who contribute only a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.

\n\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead. Contributing there might not have the most impact. Therefore, it is important to understand the parts of the system that have been recently changed. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening; focusing on the inactive parts might not be the most effective use of our time.

\n\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance. We might find ourselves needing some information regarding the codebase. Contributors to open source projects generally have other day jobs, so their time is limited, and it is important to direct any questions to the right person. One way to identify the right target for our inquiries is by using their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.

\n\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]\n", "project_metadata": {"full_name": "ChristianNogueira/datacamp_projects", "description": "DataCamp Projects", "topics": ["datacamp"], "git_url": "git://github.com/ChristianNogueira/datacamp_projects.git", "stars": 17, "watchers": 17, "forks": 13, "created": "2018-01-17T16:58:27Z", "size": 8441, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12129948}, "last_updated": "2020-08-21T20:03:31Z"}, "intent": "# Merge the obtained results with the pulls DataFrame"}, {"original_comment": "# Function to scale input data (`X`) for predictive purposes.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. 
[Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n#%%\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n#%%\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n#%%\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n#%%\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n#%%\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n#%%\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! 
Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n#%%\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n#%%\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n#%%\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n#%%\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n#%%\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed Datasets \n\n#%%\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n#%%\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, 
REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n#%%\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n#%%\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n#%%\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. \"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n#%%\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. \"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#", "target_code": "from sklearn.preprocessing import StandardScaler\n\ndef dataset_scaler(training_data, testing_data, obj=StandardScaler):\n \"\"\" Function to scale X-data using custom input algorithm. 
\"\"\"\n SCALED_FEATURES = [feature + \"_sca\" for feature in training_data]\n scaler = obj()\n scaler.fit(training_data)\n X_train_sca = pd.DataFrame(scaler.transform(\n training_data).T, SCALED_FEATURES).T\n X_test_sca = pd.DataFrame(scaler.transform(\n testing_data).T, SCALED_FEATURES).T\n return X_train_sca, X_test_sca\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. 
[Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! 
Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed Datasets \n\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + 
y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. \"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. \"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#\n\n\n\n", "project_metadata": {"full_name": "AakashSudhakar/6nomads-interview-project", "description": "Interview project repository for data analysis and prediction for 6Nomads data. 
", "topics": ["data-analysis", "data-processing", "data-science", "machine-learning", "data-structures"], "git_url": "git://github.com/AakashSudhakar/6nomads-interview-project.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2019-09-06T05:04:40Z", "size": 385, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 545554, "Python": 21164}, "last_updated": "2020-05-13T23:33:12Z"}, "intent": "# Function to scale input data (`X`) for predictive purposes."}, {"original_comment": "# Reshape validation data so that model can be run\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n#%%\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n#%%\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n#%%\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n#%%\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n#%%\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print 
coefficients\nprint('Coefficeints', lrm.coef_)", "target_code": "X_valid = X_valid.values.reshape(-1, 6)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)\n", "project_metadata": {"full_name": "abhishek3aj/ML1819--task-101--team-06", "description": "ML framework comparison", "topics": [], "git_url": "git://github.com/abhishek3aj/ML1819--task-101--team-06.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-10-09T09:48:20Z", "size": 21107, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4638466, "Python": 84406}, "last_updated": "2018-12-17T19:27:23Z"}, "intent": "# Reshape validation data"}, 
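# A minimal sketch of how the reshaped validation split from the weather
# notebook above might be used, assuming the fitted LogisticRegression (lrm),
# the X_valid/y_valid split and the roc_auc_score import from that notebook
# are in scope. The reshape to six columns mirrors the six scaled features;
# the variable names ending in _arr are introduced here only for illustration.

# Reshape the validation features into a plain (n_samples, 6) NumPy array
X_valid_arr = X_valid.values.reshape(-1, 6)

# Score the fitted model on the held-out validation data
valid_probs = lrm.predict_proba(X_valid_arr)[:, 1]
print('Validation accuracy:', lrm.score(X_valid_arr, y_valid))
print('Validation ROC AUC :', roc_auc_score(y_valid, valid_probs))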
{"original_comment": "# Calculate the mean of overcast_daily_max\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. 
You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. 
It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n#%%\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cd\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative density functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entires\n# * mean: average of entries\n# * std: standard deviation\n# * min: miniumum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n#%%\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n#%%\n\niris['sepal length (cm)'].count() # Applied to Series\n\n#%%\n\niris['sepal width (cm)'].count() # Applied to Series\n\n#%%\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n#%%\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency to a central value of a measurement\n\n#%%\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n#%%\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n#%%\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n#%%\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n#%%\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical 
value that splits the data into two sets\n# * one with the fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n#%%\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n#%%\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n#%%\n\niris.min()\n\n#%%\n\niris.max()\n\n\n# #### Box Plots\n\n#%%\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n#%%\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n#%%\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n#%%\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n#%%\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf.fare.describe()\n\n#%%\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n#%%\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n#%%\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n#%%\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n#%%\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n#%%\n\n# Print the mean of the January and March data\njanuary.mean()\n\n#%%\n\nmarch.mean()\n\n#%%\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n#%%\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n#%%\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n#%%\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n#%%\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n#%%\n\nsetosa['species'].unique()\n\n#%%\n\nversicolor['species'].unique()\n\n#%%\n\nvirginica['species'].unique()\n\n#%%\n\nsetosa.head(2)\n\n#%%\n\nversicolor.head(2)\n\n#%%\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All 
Data\n\n#%%\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n#%%\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n#%%\n\ndescribe_all = iris.describe()\ndescribe_all\n\n#%%\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n#%%\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n#%%\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n#%%\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n#%%\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n#%%\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n#%%\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n#%%\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n#%%\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n#%%\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n#%%\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n#%%\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n#%%\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n#%%\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n#%%\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n#%%\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n#%%\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n#%%\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n#%%\n\nsales.reindex(evening_2_11, method='ffill')\n\n#%%\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf2.head()\n\n#%%\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n#%%\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
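#
# (A quick, self-contained check of the example quoted above; the strings and
# the format specification are the ones given in the text, not the exercise
# data:)

#%%

pd.to_datetime(['2015-01-01 091234', '2015-01-01 091234'],
               format='%Y-%m-%d %H%M%S')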
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n#%%\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n#%%\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n#%%\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n#%%\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n#%%\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n#%%\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n#%%\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n#%%\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n#%%\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n#%%\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
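#
# (A tiny sketch of that matching behaviour with two made-up dates: the date
# that is missing from the original index comes back as NaN, as explained
# next.)

#%%

s = pd.Series([1.0, 2.0], index=pd.to_datetime(['2016-07-01', '2016-07-02']))
s.reindex(pd.to_datetime(['2016-07-01', '2016-07-03']))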
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n#%%\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n#%%\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n#%%\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n#%%\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n#%%\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n#%%\n\ndaily_mean.loc['2015-2-2']\n\n#%%\n\nsales.loc['2015-2-2', 'Units']\n\n#%%\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n#%%\n\nsales.resample('D').sum().head()\n\n#%%\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n#%%\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n#%%\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter 
|\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n#%%\n\nsales.loc[:, 'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n#%%\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n#%%\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result ***df2***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n#%%\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. 
Store the result in ***february_lows***.

#%%

# Extract temperature data for August: august
august = df.loc['2010-08', 'Temperature']
august.head()

#%%

# Downsample to obtain only the daily highest temperatures in August: august_highs
august_highs = august.resample('D').max()
august_highs.head()

#%%

# Extract temperature data for February: february
february = df.loc['2010-02', 'Temperature']
february.head()

#%%

# Downsample to obtain the daily lowest temperatures in February: february_lows
february_lows = february.resample('D').min()
february_lows.head()


# #### Rolling mean and frequency
#
# In this exercise, some hourly weather data is pre-loaded for you. You will continue to practice resampling, this time using rolling means.
#
# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.
#
# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.
#
# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.
#
# ***Instructions***
#
# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.
# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.
# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.
# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.

#%%

# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed
unsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']
unsmoothed.head()

#%%

# Apply a rolling mean with a 24 hour window: smoothed
smoothed = df['Temperature']['2010-Aug-01':
                             '2010-Aug-15'].rolling(window=24).mean()
smoothed.iloc[20:30]

#%%

# Create a new DataFrame with columns smoothed and unsmoothed: august
august = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})
august.head()

#%%

# Plot both smoothed and unsmoothed data using august.plot().
august.plot()


# #### Resample and roll with it
#
# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).
#
# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. 
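#
# (For reference, one such chain on the hourly DataFrame df from the previous
# exercises might look like the sketch below; the exercise that follows builds
# the same result step by step.)

#%%

df['Temperature']['2010-08'].resample('D').max().rolling(window=7).mean().head(10)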
Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. Assign the result to ***daily_highs_smoothed*** and print the result.\n\n#%%\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n#%%\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n#%%\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n#%%\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n#%%\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n#%%\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n#%%\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n#%%\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n#%%\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n#%%\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n#%%\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n#%%\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n#%%\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n#%%\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n#%%\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n#%%\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n#%%\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n#%%\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n#%%\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n#%%\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
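#
# (For contrast, a minimal sketch of the index-based case just described, with
# made-up data: Series.tz_localize() here acts on the DatetimeIndex itself.)

#%%

s_tz = pd.Series([1, 2, 3], index=pd.date_range('2015-07-01', periods=3, freq='D'))
s_tz.tz_localize('US/Central')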
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n#%%\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n#%%\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n#%%\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n#%%\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n#%%\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n#%%\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n#%%\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n#%%\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n#%%\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n#%%\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n#%%\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes 
not only in your data, but also in your plotting.\n#\n# In this exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n#%%\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n#%%\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n#%%\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n#%%\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n#%%\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n#%%\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n#%%\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n#%%\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n#%%\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n#%%\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n#%%\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n#%%\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
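#
# (The essence of the transformation performed below, shown on a single made-up
# value: a date string concatenated with a zero-padded time string and parsed
# with an explicit format.)

#%%

pd.to_datetime('20110101' + '{:0>4}'.format(53), format='%Y%m%d%H%M')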
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n#%%\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n#%%\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n#%%\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n#%%\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n#%%\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
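#
# (A self-contained sketch of the 'M' -> NaN coercion described above, using a
# small made-up Series rather than the weather columns:)

#%%

pd.to_numeric(pd.Series(['10', 'M', '12.5']), errors='coerce')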
Again, specify ***errors='coerce'***.\n#\n\n#%%\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n#%%\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n#%%\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. 
You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n#%%\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n#%%\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n#%%\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n#%%\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n#%%\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. 
That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\ndf_clean.head(3)\n\n#%%\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n#%%\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n#%%\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n#%%\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n#%%\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n#%%\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n#%%", "target_code": "overcast_daily_max_mean = overcast_daily_max.mean()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * 
[World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. 
In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
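
# A self-contained sketch of the same read_csv() options on an inline string
# (io.StringIO stands in for the messy file; the real exercise uses file_messy):
# skip three junk lines, treat spaces as the delimiter, and drop '#' comment rows.

from io import StringIO

messy = StringIO(
    "junk line one\n"
    "junk line two\n"
    "junk line three\n"
    "name Jan Feb\n"
    "# a comment record mixed in with the data\n"
    "IBM 156.08 160.01\n"
    "MSFT 45.51 43.08\n"
)
pd.read_csv(messy, delimiter=' ', header=3, comment='#')
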
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
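
# Relatedly, a small sketch of the three plotting idioms flagged in the
# "Word of Warning" above, on a throwaway DataFrame. (In current pandas the
# accessor form is spelled .plot.hist(), i.e. df.plot.hist().)

demo = pd.DataFrame({'x': np.random.randn(100)})

demo.plot(kind='hist')   # returns a single matplotlib Axes
demo.plot.hist()         # same plot via the .plot accessor
demo.hist()              # returns an array of Axes, one histogram per column
plt.show()
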
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cd\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative density functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entires\n# * mean: average of entries\n# * std: standard deviation\n# * min: miniumum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n\niris['sepal length (cm)'].count() # Applied to Series\n\n\niris['sepal width (cm)'].count() # Applied to Series\n\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency to a central value of a measurement\n\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical value that splits the data into two sets\n# * one with the 
fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n\niris.min()\n\n\niris.max()\n\n\n# #### Box Plots\n\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf.fare.describe()\n\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
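
# A tiny synthetic illustration of why both statistics are worth computing
# (made-up numbers, not the Pittsburgh data): two samples can share a mean of
# 30 yet have very different spread.

steady = pd.Series([30, 31, 29, 30, 30])
swingy = pd.Series([10, 50, 28, 45, 17])
print(steady.mean(), swingy.mean())  # both 30.0
print(steady.std(), swingy.std())    # ~0.7 vs ~17.3
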
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n\n# Print the mean of the January and March data\njanuary.mean()\n\n\nmarch.mean()\n\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n\nsetosa['species'].unique()\n\n\nversicolor['species'].unique()\n\n\nvirginica['species'].unique()\n\n\nsetosa.head(2)\n\n\nversicolor.head(2)\n\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All Data\n\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n 
alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n\ndescribe_all = iris.describe()\ndescribe_all\n\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
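
# A minimal sketch of the kind of method chaining meant here, on a synthetic
# time series (the chapter below uses real sales data): select, resample and
# aggregate in a single chained expression.

rng = pd.date_range('2015-02-01', periods=6, freq='12H')
demo = pd.DataFrame({'Units': [3, 4, 1, 5, 2, 6]}, index=rng)
demo.loc['2015-02-01':'2015-02-02', 'Units'].resample('D').sum()
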
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n\nsales.reindex(evening_2_11, method='ffill')\n\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n\ndf1.head()\n\n\ndf2.head()\n\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
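
# A quick sketch of the format-string parsing described above, reusing the
# example strings from the prose (not the pre-loaded date_list):

pd.to_datetime(['2015-01-01 091234', '2015-01-01 091234'],
               format='%Y-%m-%d %H%M%S')
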
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n\ndaily_mean.loc['2015-2-2']\n\n\nsales.loc['2015-2-2', 'Units']\n\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n\nsales.resample('D').sum().head()\n\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter |\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n\nsales.loc[:, 
'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result to ***df2***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. Store the result in ***february_lows***.\n\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n\n# Extract temperature data for February: february\nfebruary = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# #### Rolling mean and frequency\n#\n# In this exercise, some hourly weather data is pre-loaded for you. 
You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. 
Assign the result to ***daily_highs_smoothed*** and print the result.\n\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
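For instance, when the datetimes already live in a ```DatetimeIndex```, the whole frame can be localized and converted in one step. The sketch below is illustrative only (```indexed_df``` is a made-up toy frame built on a timezone-naive ```DatetimeIndex```, not part of the exercise data):\n#\n# ```python\n# idx = pd.date_range('2015-02-01', periods=3, freq='H')      # a naive DatetimeIndex\n# indexed_df = pd.DataFrame({'Units': [1, 2, 3]}, index=idx)  # toy frame for illustration\n# indexed_df = indexed_df.tz_localize('US/Central')           # attach a timezone to the index\n# indexed_df.tz_convert('US/Eastern')                         # convert the index to another zone\n# ```\n#\n# 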
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes not only in your data, but also in your plotting.\n#\n# In this 
exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. 
This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\ndf_clean.head(3)\n\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. 
Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "intent": "# Calculate the mean of overcast_daily_max"}, {"original_comment": "# Let's separate into train and test set\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. 
Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n#%%\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n#%%", "target_code": "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n data[['age', 'fare']],\n data['survived'],\n test_size=0.3,\n random_state=0)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. 
Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "intent": "# Let's separate into train and test set"}, {"original_comment": "# The standard deviation of daily offence count\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n#%%\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n#%%\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n#%%\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n#%%\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n#%%\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n#%%", "target_code": "std_count = crime_by_date.groupBy() .agg(stddev(\"Count\")\n ) .withColumnRenamed(\"stddev_samp(Count)\", \"Standard Deviation\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 
01: Import pyspark and initialise Spark\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n", "project_metadata": {"full_name": "WaicongTam/Assignment-Portfolio", "description": "This repository is showcase of the codes of my assignments. All the assignments I consider worth sharing will be updated here right after the late penalty has reached 50%.", "topics": [], "git_url": "git://github.com/WaicongTam/Assignment-Portfolio.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2019-06-01T03:27:31Z", "size": 10261, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1789685, "Java": 101530}, "last_updated": "2020-10-15T15:22:21Z"}, "intent": "# The standard deviation of daily offence count"}, {"original_comment": "# set axis labels\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. 
Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n#%%\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n#%%\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n#%%\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n#%%\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n#%%\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. 
US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n#%%\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n#%%\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n#%%\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n#%%\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n#%%\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
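# As a quick cross-check of the Bollinger Band formulas above, the upper and lower bands for the 20-day lookback can be computed directly from the rolling mean and rolling standard deviation obtained in the previous cells. This is a minimal illustrative sketch: the column names 'BB_UPPER_20' and 'BB_LOWER_20' and the band width $k = 2$ are assumptions made here for demonstration only and are not required by the following cells.\n\n#%%\n\n# assumed band width in rolling standard deviations (illustration only)\nk = 2.0\n\n# upper and lower Bollinger Bands around the 20-day simple moving average\nmre_mav_eurusd_data['BB_UPPER_20'] = mre_mav_eurusd_data['SMAV_20'] + (k * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['BB_LOWER_20'] = mre_mav_eurusd_data['SMAV_20'] - (k * mre_mav_eurusd_data['STD_20'])\n\n# inspect the resulting bands next to the adjusted closing prices\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'BB_UPPER_20', 'BB_LOWER_20']].iloc[20:30]\n\n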
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n#%%\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n#%%\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n#%%\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n#%%\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
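# Before converting the Z-Score into trading signals, a small numerical sanity check can be helpful. The sketch below recomputes the Z-Score by hand for a single row and compares it against the stored 'Z_SCORE' column; the row position 25 is an arbitrary illustrative choice, and any row after the 20-day warm-up period would work equally well.\n\n#%%\n\n# pick one observation after the rolling-window warm-up period\nrow = mre_mav_eurusd_data.iloc[25]\n\n# recompute the z-score from its definition and compare it to the pre-computed column\nz_manual = (row['Adj Close'] - row['SMAV_20']) / row['STD_20']\nprint('manual z-score: {:.6f} vs. stored z-score: {:.6f}'.format(z_manual, row['Z_SCORE']))\n\n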
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) if no position is held and $z_{i}(t) < -\\;\\alpha$;\n# >- **\"Close Long-signal\"** (0.0) if a long position is held and $|z_{i}(t)| < \\beta$;\n# >- **\"Short-signal\"** (-1.0) if no position is held and $z_{i}(t) > +\\;\\alpha$; and,\n# >- **\"Close Short-signal\"** (0.0) if a short position is held and $|z_{i}(t)| < \\beta$.\n#\n# Let's now determine the mean-reversion trading signals by setting the Z-Score thresholds. We set the two thresholds to $\\alpha = 1.0$ and $\\beta = 0.5$, respectively, as done in the following:\n\n#%%\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently, we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n#%%\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: no position currently held\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed either threshold\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: a long or short position is currently held\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to the moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yet reverted back to the moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# merge the signal column into the market price data\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z-Scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
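# Before setting up that baseline, it is worth checking how often the mean-reversion rules actually fire. The following minimal sketch simply counts the number of long (+1.0), short (-1.0) and neutral (0.0) signal days in the 'SIGNAL_20' column created above; it introduces no new variables and does not alter the signal data used later on.\n\n#%%\n\n# frequency of the generated long / short / neutral signals\nmre_mav_eurusd_data['SIGNAL_20'].value_counts()\n\n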
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n#%%\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n#%%\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n#%%\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n#%%\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n#%%\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n#%%\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n#%%\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n#%%\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n#%%\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n#%%\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n#%%\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n#%%\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n#%%\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n#%%\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)", "target_code": "ax.set_xlabel('[month]', fontsize=10)\nax.set_ylabel('[year]', fontsize=10)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. 
don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
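# Note that the first rows of the rolling statistics are undefined because a full 20-day window is not yet available, which is why the inspection above starts at row position 20. As a small illustrative check (assuming the price series itself contains no missing values), the number of undefined rows should equal the lookback window length minus one:\n\n\nmre_mav_eurusd_data['SMAV_20'].isna().sum()\n\n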
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
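# To get a feeling for what constitutes a reasonable choice of these thresholds, the empirical distribution of the rolling Z-Score can be summarised first. This is a small illustrative sketch only and does not affect the threshold values selected below:\n\n\n# summary statistics of the rolling z-score distribution\nmre_mav_eurusd_data['Z_SCORE'].describe()\n\n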
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) signal if $z_{i}(t) \\leq -\\; \\alpha \\cdot z_{i}(t)$;\n# >- **\"Close Long-signal\"** (0.0) signal if $z_{i}(t) \\leq -\\; \\beta \\cdot z_{i}(t)$;\n# >- **\"Short-signal\"** (+1.0) signal if $z_{i}(t) \\geq +\\; \\alpha \\cdot z_{i}(t)$; and,\n# >- **\"Close Short-signal\"** (0.0) signal if $z_{i}(t) \\geq +\\; \\beta \\cdot z_{i}(t)$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set both Z-Score thresholds $\\alpha = 1.0$ and $\\beta = 0.5$ respectively, as done in the following:\n\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: inactive trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yer reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
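# As an additional check before the baseline is defined, the share of trading days on which the mean-reversion strategy actually holds an open position can be computed from the signal column; this is again only an illustrative sketch and introduces no new variables:\n\n\n# fraction of days with an open long or short position\n(mre_mav_eurusd_data['SIGNAL_20'] != 0.0).mean()\n\n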
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "intent": "# set axis labels"}, {"original_comment": "# Plot a single hologram with the particles overlaid\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n#%%\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n#%%\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n#%%\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n#%%\n\nmodel.output_shape\n\n#%%\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n#%%\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n#%%\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), 
dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n#%%\n\n15000/2\n\n#%%\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n#%%\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n#%%", "target_code": "def plot_hologram(h, img, outputs=\"none\"):\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n 
output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n\nmodel.output_shape\n\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n 
self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n\n15000/2\n\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "intent": "# Plot a single hologram with the particles overlaid"}, {"original_comment": "# Annotate plots\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. 
HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n#%%\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n#%%\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n#%%\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n#%%\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n#%%\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n#%%\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n#%%\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n#%%\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
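# A minimal sketch of one possible implementation for the convert_to_one_hot [TODO] above,
# assuming the 10 CIFAR10 classes; the helper name and the num_classes default are illustrative
# and not the notebook author's solution.

#%%

def convert_to_one_hot_example(old_class, num_classes=10):
    """Return a float vector of length num_classes with a 1.0 at index old_class."""
    one_hot = np.zeros(num_classes, dtype=np.float32)
    one_hot[old_class] = 1.0
    return one_hot


# Example: convert_to_one_hot_example(2) gives [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]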
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n#%%\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n#%%\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n#%%\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
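# A sketch of one way the [TODO] layers in ConvolutionalNeuralNetwork above could be filled in,
# not the official solution. The conv1/conv2 attribute names follow the visualization cells near
# the end of the notebook; the fully connected layer names and the final sigmoid are assumptions
# (BCELoss, used later, expects outputs in [0, 1], so a plain ReLU output would not fit).

#%%

class ExampleCNN(nn.Module):
    """Illustrative solution sketch for the hands-on exercise above."""

    def __init__(self):
        super(ExampleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)    # [B, 3, 32, 32]  -> [B, 32, 30, 30]
        self.conv2 = nn.Conv2d(32, 64, 3)   # [B, 32, 15, 15] -> [B, 64, 13, 13]
        self.conv3 = nn.Conv2d(64, 128, 3)  # [B, 64, 6, 6]   -> [B, 128, 4, 4]
        self.fc1 = nn.Linear(512, 128)      # 128 * 2 * 2 = 512 flat features
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))  # -> [B, 32, 15, 15]
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))  # -> [B, 64, 6, 6]
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))  # -> [B, 128, 2, 2]
        x = x.view(x.size(0), -1)                        # flatten to [B, 512]
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))                   # class probabilities in [0, 1]
        return x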
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n#%%\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n#%%\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n#%%\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n#%%\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
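# A tiny worked example of how get_number_of_correct_labels above counts hits: the top-1 index of
# each prediction is compared with the top-1 index of the one-hot label. torch.FloatTensor is used
# here for brevity; the notebook itself wraps batches in Variable, and on PyTorch >= 0.4 the
# trailing `.data[0]` would be spelled `.item()`.

#%%

example_out = torch.FloatTensor([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])  # two predictions
example_lab = torch.FloatTensor([[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])  # two one-hot labels
example_hits = example_out.topk(1, 1)[1].eq(example_lab.topk(1, 1)[1]).float().sum()  # 1 correct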
The training will use mini-batches with the size defined above.\n\n#%%\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... 
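# The five [TODO] steps inside train() above, written out as a standalone helper so their order is
# explicit. This is a sketch with illustrative argument names, not the notebook's own code; the
# validation() function below needs only the forward pass and the loss, skipping
# zero_grad/backward/step.

#%%

def example_training_step(model, batch_images, batch_classes, loss_fn, opt):
    opt.zero_grad()                            # reset all gradients in the model
    net_output = model(batch_images)           # compute output based on input images
    loss = loss_fn(net_output, batch_classes)  # compute loss against the true classes
    loss.backward()                            # compute gradients for the network's weights
    opt.step()                                 # backprop step: update weights with the gradients
    return net_output, loss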
it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n#%%\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n#%%\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n#%%\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: 
{:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n#%%\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n#%%\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n#%%\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. We'll also do **the same** transformations as on train/validation dataset (very important).\n\n#%%\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n#%%\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. 
It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n#%%\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])", "target_code": "subplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. 
We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
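# Side note: the two np.swapaxes calls above are equivalent to a single transpose. A quick check
# on a dummy batch (the dummy_batch name is illustrative only):

dummy_batch = np.zeros((8, 32, 32, 3))
channels_first = np.transpose(dummy_batch, (0, 3, 1, 2))
print(channels_first.shape)  # (8, 3, 32, 32), the [BATCH, CHANNEL, IMAGE_Y, IMAGE_X] layout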
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
The training will use mini-batches with the size defined above.\n\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. 
* \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: {:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. 
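# One possible way to act on the [TRY ME] hint at the end of the epoch loop above: halve the
# learning rate when validation loss rises while training loss keeps falling, a crude overfitting
# signal. The helper name and the 0.5 factor are assumptions, not the author's code.

def maybe_lower_learning_rate(opt, train_losses, valid_losses, factor=0.5):
    """Decay the optimizer learning rate when a simple overfitting signal appears."""
    if len(valid_losses) >= 2 and valid_losses[-1] > valid_losses[-2] \
            and train_losses[-1] < train_losses[-2]:
        for param_group in opt.param_groups:
            param_group['lr'] *= factor

# For example, maybe_lower_learning_rate(optimizer, training_losses, validation_losses) could be
# called at the end of each epoch inside the loop above.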
We'll also do **the same** transformations as on train/validation dataset (very important).\n\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n", "project_metadata": {"full_name": "jpowie01/CIFAR10-HandsOn", "description": "Hands-on prepared for one of my presentations that took place on Computer Vision's mini-course at student's orgranization called \"Gradient\" (Gda\u0144sk University of Technology)", "topics": ["deep-learning", "convolutional-neural-networks", "cifar10", "jupyter-notebook", "hands-on"], "git_url": "git://github.com/jpowie01/CIFAR10-HandsOn.git", "stars": 6, "watchers": 6, "forks": 0, "created": "2018-01-03T21:22:35Z", "size": 9589, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1717141}, "last_updated": "2018-01-09T19:26:07Z"}, "intent": "# Annotate plots"}, {"original_comment": "# Write to disk\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with 
Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max 
mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 
'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: 
df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n#%%\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n#%%\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n#%%\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n#%%\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n#%%\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n 
#formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n#%%\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n#%%\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. 
Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed", "target_code": "df_Diffusion.to_csv(os.path.join(data_subdirectory, 'df_Diffusion.csv'))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, 
hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = 
pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 
years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n 
df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = 
pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero for diagonal 
elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "intent": "# Write to disk"}, {"original_comment": "# Check if 0 is in x\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. 
Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. 
the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n#%%\n\nimport math\nx = 1 + 1\nx\n\n#%%\n\ny = 1.0 * 2\ny\n\n#%%\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n#%%\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n#%%\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n#%%\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n#%%\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n#%%\n\n# Classic division\n3 / 5\n\n#%%\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n#%%\n\n5 % 2 # Remainder of number 1 / number 2\n\n#%%\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n#%%\n\nx = 1\ny = 1.0\nz = 2\n\n#%%\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n#%%\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n#%%\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)", "target_code": "print(0 in x)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. 
Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. 
the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n\nimport math\nx = 1 + 1\nx\n\n\ny = 1.0 * 2\ny\n\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n\n# Classic division\n3 / 5\n\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n\n5 % 2 # Remainder of number 1 / number 2\n\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n\nx = 1\ny = 1.0\nz = 2\n\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)\n", "project_metadata": {"full_name": "jlaura/GIS321", "description": null, "topics": [], "git_url": "git://github.com/jlaura/GIS321.git", "stars": 5, "watchers": 5, "forks": 15, "created": "2016-01-11T03:36:14Z", "size": 3772, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 250997}, "last_updated": "2017-04-06T06:32:08Z"}, "intent": "# Check if 0 is in x"}, {"original_comment": "# Turning the text into words using the nltk word_tokenizer\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with URLs.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in Python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n#%%\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom nltk import word_tokenize\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n#%%\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n#%%\n\ntype(raw_text)\n\n#%%\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n#%%\n\nraw_text[:100]\n\n\n# ## Tokenization", "target_code": "from nltk import word_tokenize\n\nwords_text = word_tokenize(raw_text)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with URLs.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in Python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n\ntype(raw_text)\n\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n\nraw_text[:100]\n\n\n# ## Tokenization\n\n\n\n", "project_metadata": {"full_name": "derekjjackson/DH_PythonLibraries_JupyterNotebooks", "description": "FIles and resources for using Data Science, Python, and Jupyter Notebooks in the practice of Digital Humanities", "topics": [], "git_url": "git://github.com/derekjjackson/DH_PythonLibraries_JupyterNotebooks.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2018-10-20T15:06:33Z", "size": 29200, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10076299}, "last_updated": "2020-12-25T21:05:12Z"}, "intent": "# Turning the text into words using the nltk word_tokenizer"}, {"original_comment": "# print frequency distributions of wines' quality\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n#%%\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n#%%\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in 
wine_set.columns]\n\n\ncall(rm)\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n#%%\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n#%%\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n#%%\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n#%%\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n#%%\n\ndef 
basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\n#%%", "target_code": " print(wine_set.groupby(\"quality\").size()*100 / len(wine_set))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of 
wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\ndef frequencyDists(wine_set):\n print(\"This is the frequency distribution of the wines' quality.\")\n", "project_metadata": {"full_name": "shrikant-temburwar/Wine-Quality-Dataset", "description": null, "topics": [], "git_url": "git://github.com/shrikant-temburwar/Wine-Quality-Dataset.git", "stars": 7, "watchers": 7, "forks": 13, "created": "2018-06-11T14:03:02Z", "size": 575, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 670078}, "last_updated": "2020-12-16T12:41:33Z"}, "intent": " # print frequency distributions of wines' quality"}, {"original_comment": "# the following directive activates inline plotting\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework 
import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n#%%\n\n# should be version 1.x\nprint(tf.__version__)\n\n#%%", "target_code": "get_ipython().run_line_magic('matplotlib', 'inline')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n\n# should be version 1.x\nprint(tf.__version__)\n\n", "project_metadata": {"full_name": "NeilAlishev/HiCPredictor", "description": "Predict Hi-C maps from the DNA sequence using deep convolutional neural networks", "topics": [], "git_url": "git://github.com/NeilAlishev/HiCPredictor.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-01-12T17:39:25Z", "size": 25045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 9881823, "Python": 17479}, "last_updated": "2020-11-13T16:32:28Z"}, "intent": "# activate inline plotting"}, {"original_comment": "# We won't be using \"Time\" variable\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n#%%\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n#%%\n\n# ##Inspecting the data\n\n#%%\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n#%%\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n#%%\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n#%%\n\nsns.countplot(x='Class', data=dat)", "target_code": "dat = dat.drop(['Time'], 1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset 
https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n\n# ##Inspecting the data\n\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n\nsns.countplot(x='Class', data=dat)\n\n\n\n", "project_metadata": {"full_name": "dpanagop/ML_and_AI_examples", "description": null, "topics": [], "git_url": "git://github.com/dpanagop/ML_and_AI_examples.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2019-07-16T10:55:13Z", "size": 12192, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5902376}, "last_updated": "2020-11-24T20:45:33Z"}, "intent": "# We won't be using \"Time\" variable"}, {"original_comment": "# ### Deploy the model\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n#%%\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n#%%\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n#%%\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n#%%\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n#%%\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n#%%\n\nic.fit(inputs=s3_channels)", "target_code": "c5_predictor = ic.deploy(initial_instance_count=1,\n 
instance_type='ml.c5.large',\n endpoint_name=endpoint_name,\n wait=False)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n\nic.fit(inputs=s3_channels)\n\n\n\nendpoint_name = 'c5-'+time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n\n", "project_metadata": {"full_name": "PacktPublishing/Learn-Amazon-SageMaker", "description": "Learn Amazon SageMaker", "topics": [], "git_url": "git://github.com/PacktPublishing/Learn-Amazon-SageMaker.git", "stars": 30, "watchers": 30, "forks": 20, "created": "2020-04-22T14:55:25Z", "size": 47447, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2818256, "Python": 146100, "R": 2078, "Dockerfile": 738}, "last_updated": "2020-12-29T08:53:02Z"}, "intent": "# Deploy the model"}, {"original_comment": "# and capture it into a dataframe\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). 
Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n#%%\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n#%%\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n#%%\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n#%%\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n#%%\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n#%%\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n#%%\n\ndata['sex'].unique()\n\n#%%\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n#%%\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n#%%\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the 
mappings separately if planning to use the model in production.\n\n#%%\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()", "target_code": "prob_df = pd.DataFrame(prob_df)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n\ndata['sex'].unique()\n\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, 
y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "intent": "# capture it into a dataframe"}, {"original_comment": "# Make a copy in the specific subfolder\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = 
df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = 
df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n 
pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase 
too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))", "target_code": "df_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = 
pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = 
df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel 
df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = 
df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "intent": "# Make a copy in the specific subfolder"}, {"original_comment": "# Find value counts for every 
column in reviews.csv\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n#%%\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n#%%\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n#%%\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n#%%\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n#%%\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n#%%\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n#%%\n\ndf_reviews.head()\n\n#%%\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n#%%\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n#%%\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n#%%\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n#%%\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n#%%", "target_code": "df_reviews.apply(lambda x: x.isnull().value_counts())\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n\ndf_reviews.head()\n\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n\n# 
Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n", "project_metadata": {"full_name": "sheetalbongale/ALE-gorithm", "description": "All things Beer! Beer Educator and Recommender Web App | Deployed on GCP > https://alegorithm-fxljyqhslq-uc.a.run.app/ | UT Data Analysis and Visualization Nov 2019 - May 2020. ", "topics": ["recommender", "gcp-cloud-build", "python-flask-application", "sqlalchemy", "plotlyjs", "anychart-javascript-library", "d3js", "mysql"], "git_url": "git://github.com/sheetalbongale/ALE-gorithm.git", "stars": 5, "watchers": 5, "forks": 5, "created": "2020-03-01T22:59:58Z", "size": 56307, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 204948, "JavaScript": 52038, "CSS": 48412, "HTML": 46213, "Python": 15403, "Dockerfile": 433}, "last_updated": "2020-05-07T08:39:07Z"}, "intent": "# Find value counts for every column in reviews.csv"}, {"original_comment": "# Add the title\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. 
You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)", "target_code": "plt.title('Fuel efficiency vs Horse-power')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * 
[Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your 
data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
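# As a minimal sketch of that first step (the ```toy``` frame and its numbers are invented for
# illustration and are not the World Bank data; ```pd``` and ```np``` are the aliases imported at
# the top of this notebook), ```.values``` hands NumPy a plain 2D ```ndarray```, and ```np.log10()```
# applied to it returns another ```ndarray```:


toy = pd.DataFrame({'Year': [1960, 1970, 1980],
                    'Total Population': [3.0e9, 3.7e9, 4.4e9]})

toy_vals = toy.values          # plain numpy.ndarray, no index or column labels
print(type(toy_vals))          # <class 'numpy.ndarray'>
print(np.log10(toy_vals))      # element-wise base-10 logarithm, still an ndarray

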
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
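# The effect of those non-default options can be previewed on a tiny made-up string first.
# ```fake_messy``` and ```preview``` below are illustrative names only -- this is not the contents
# of ```file_messy```, just a miniature with the same three problems: junk lines above the header,
# a ```#``` comment record inside the data, and space delimiters.


from io import StringIO

fake_messy = ("collected 2016\n"      # three free-text lines before the real header
              "unknown source\n"
              "educational use\n"
              "name Jan Feb\n"        # the actual header row
              "# comment record\n"    # an interleaved comment line
              "IBM 156.08 160.01\n"
              "MSFT 45.51 43.08\n")

# delimiter=' '  ->  columns are separated by spaces rather than commas
# header=3       ->  row 3 (the 4th non-comment line) holds the column names; earlier rows are discarded
# comment='#'    ->  lines starting with '#' are ignored entirely
preview = pd.read_csv(StringIO(fake_messy), delimiter=' ', header=3, comment='#')
preview

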
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
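# The pattern being asked for is sketched below on a couple of made-up rows (the ```toy``` frame
# uses round illustrative numbers, not the 2015 prices): pass the x column name and a *list* of
# y column names to ```.plot()```, and pandas draws one line per listed column.


toy = pd.DataFrame({'Month': ['Jan', 'Feb', 'Mar'],
                    'AAPL': [100.0, 110.0, 105.0],
                    'IBM': [150.0, 155.0, 152.0]})

toy.plot(x='Month', y=['AAPL', 'IBM'])   # one line per column in the y list
plt.ylabel('Price ($US)')
plt.show()

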
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "intent": "# Add the title"}, {"original_comment": "# Add output layer\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. 
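# #### In other words, once every node has a fixed-length vector, a single transaction (buyer, seller) can be represented by concatenating the two vectors and handing the result to a classifier. A toy sketch of that idea follows, with made-up node ids, 3-dimensional vectors, and an illustrative helper ```edge_features``` that is not part of the original pipeline (the embeddings built later in this notebook are 20-dimensional, so the real concatenated vectors have length 40).

#%%

import numpy as np

# Illustrative embeddings only -- not produced by Node2Vec
toy_emb = {101: np.array([0.2, -0.1, 0.7]),
           202: np.array([0.5, 0.3, -0.4])}


def edge_features(buyer, seller, emb):
    """Feature vector for one transaction: buyer embedding followed by seller embedding."""
    return np.concatenate([emb[buyer], emb[seller]])


edge_features(101, 202, toy_emb)  # length 6 here; length 40 with the real 20-dim embeddings

#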
Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n#%%\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. Let's look at a portion of the table\n\n#%%\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n#%%\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n#%%\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n#%%\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n#%%\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = 
str(trim_network['SOURCE'][i])\n t = str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n#%%\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n#%%\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n#%%\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n#%%\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n#%%\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n#%%\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n#%%\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n#%%\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. 
Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n#%%\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n#%%\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n#%%\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n#%%\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n#%%\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n#%%\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, 
without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n plt.colorbar()\n tick_marks = np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n#%%\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))", "target_code": "M1.add(Dense(1, activation='sigmoid'))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. 
Let's look at a portion of the table\n\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). 
This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n plt.colorbar()\n tick_marks = 
np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))\n", "project_metadata": {"full_name": "insight-decentralized-consensus-lab/TrustKeeper", "description": "A fraud prevention system for Peer-to-Peer transaction networks (Jahir M Gutierrez)", "topics": [], "git_url": "git://github.com/insight-decentralized-consensus-lab/TrustKeeper.git", "stars": 8, "watchers": 8, "forks": 7, "created": "2018-09-28T20:15:21Z", "size": 10845, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 614132, "Scala": 19178, "Python": 14102}, "last_updated": "2020-03-18T22:55:35Z"}, "intent": "# Add output layer"}, {"original_comment": " # as usual, we use text-embeddings\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n#%%\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n#%%\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n#%%\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n#%%\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n#%%\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\n\n\n#%%\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n#%%\n\ncorpus\n\n#%%\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n#%%\n\n# AFINN Dictionary for Sentiment Analysis: 
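# A hedged usage sketch (my addition): embed two short example reviews with text_embedding and
# compare them via cosine similarity; assumes the small embedding model covers these words.
v1 = np.array(text_embedding("the food was great and tasty")).reshape(1, -1)
v2 = np.array(text_embedding("the meal was delicious")).reshape(1, -1)
print(cosine_similarity(v1, v2)[0][0])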
https://github.com/fnielsen/afinn\n# https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n#%%\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n#%%\n\npred\n\n#%%\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n#%%\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values", "target_code": " text = text_embedding(text)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\n\n\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n\ncorpus\n\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n\n# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn\n# 
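# A hedged sketch (my addition) of the embedding-based route: turn every review into a document
# vector with text_embedding and fit a simple classifier on top. Variable names are illustrative
# and it assumes each review contains at least one word known to the embedding model.
X_emb = np.array([text_embedding(doc) for doc in corpus])
X_tr, X_te, y_tr, y_te = train_test_split(X_emb, labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(accuracy_score(y_te, clf.predict(X_te)))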
https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n\npred\n\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "intent": " # use text-embeddings"}, {"original_comment": "# ### Put them all together in multiplots\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n#%%\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n#%%\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n#%%\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n#%%\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! 
>\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n#%%\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n#%%\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n#%%\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n#%%\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n#%%\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. 
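# A hedged aside (my addition, not part of the pz_blend API): cut lists of this form can also be
# applied directly with pandas; .notna()-style expressions require the python engine.
truth_df_cut = truth_df.query(" and ".join(truth_cuts), engine='python')
print(len(truth_df), '->', len(truth_df_cut), 'rows after truth cuts')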
using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n#%%\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)", "target_code": "pzb.plot_multi(names=['fof', 'zz', 'pdf', 'pit'], verbose=1, save_plot=1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! >\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest 
values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)\n\n\n\n", "project_metadata": {"full_name": "LSSTDESC/pz_blend", "description": "impact of blending on photo-zs using DC2 truth catalogs and image catalogs", "topics": [], "git_url": "git://github.com/LSSTDESC/pz_blend.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-03-12T22:06:14Z", "size": 2183, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1709826, "Python": 89195}, "last_updated": "2020-12-09T18:50:43Z"}, "intent": "# Put them all together in multiplots"}, {"original_comment": "# set axis labels\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. 
Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n#%%\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n#%%\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n#%%\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n#%%\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n#%%\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n#%%\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n#%%\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n#%%\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n#%%\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n#%%\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n#%%\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n#%%\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n#%%\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n#%%\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n#%%\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n#%%\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n#%%\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n#%%\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n#%%\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n#%%\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n#%%\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n#%%\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
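# A hedged alternative (my addition) to the rolling-window loops above: the same overlapping
# sequences can be built without an explicit Python loop using numpy's sliding_window_view
# (NumPy >= 1.20); the [1:] slice skips the leading NaN return exactly like the loops do.
from numpy.lib.stride_tricks import sliding_window_view
train_windows = sliding_window_view(train_stock_data['RETURN'].values[1:], sequence_length)
print(train_windows.shape)  # (number of windows, sequence_length)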
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n#%%\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n#%%\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n#%%\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n#%%\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n#%%\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n#%%\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! 
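# A hedged sanity check (my addition): a dummy forward pass confirming that LSTMNet returns one
# prediction per input time step, i.e. an output of shape (batch_size, time_steps).
dummy_batch = torch.zeros(2, time_steps).to(device)
assert lstm_model(dummy_batch).shape == (2, time_steps)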
Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n#%%\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. Definition of the Training Loss Function and Learning Rate\n\n# We are now good to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimize the prediction error of the true return $r_{t+1}$ and the by the model predicted return $\\hat{r}_{t+1}$ at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable to predicts the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that optimize $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L_{\\theta}}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\mathcal{L}^{MSE}_{\theta} (r_{t+1}, \hat{r}_{t+1}) = \frac{1}{N} \sum_{i=1}^{N} \big\| r^{i}_{t+1} - \hat{r}^{i}_{t+1} \big\|^{2}$, where $i$ indexes the $N$ training sequences.
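# A hedged aside (my addition): with its default 'mean' reduction, nn.MSELoss computes exactly
# the mean of the squared element-wise differences, matching the formula above.
_a, _b = torch.randn(3, 4), torch.randn(3, 4)
assert torch.allclose(nn.MSELoss()(_a, _b), ((_a - _b) ** 2).mean())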
\n\n#%%\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n#%%\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n#%%\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n#%%\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n#%%\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n#%%\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n#%%\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n#%%\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)", "target_code": "plt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. 
Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. 
Definition of the Training Loss Function and Learning Rate\n\n# We are now good to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimize the prediction error of the true return $r_{t+1}$ and the by the model predicted return $\\hat{r}_{t+1}$ at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable to predicts the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that optimize $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L_{\\theta}}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,
\n\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "intent": "# set axis labels"}, {"original_comment": "# Show with Legend\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n#%%\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n#%%\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n#%%\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n#%%\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and 
size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n#%%\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)", "target_code": "plt.show()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# 
Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)\nplt.legend()\nplt.savefig('lumpsum.png')\n", "project_metadata": {"full_name": "eonofrey/DollarCostAverage_vs._LumpSum", "description": "Comparing dollar cost averaging vs. lump sum investment in the SPY ", "topics": [], "git_url": "git://github.com/eonofrey/DollarCostAverage_vs._LumpSum.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-06-19T21:58:51Z", "size": 1525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 587938}, "last_updated": "2020-12-19T01:53:56Z"}, "intent": "# Show with Legend"}, {"original_comment": "# ### Boxplots of Amount by Gender\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n#%%\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n#%%\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')", "target_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', x='gender', data=gender_amt, hue='gender',\n dodge=False, width=0.6, palette='Set2')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')\n\n\n\ngender_amt = pd.DataFrame(fraud_df.head(100), columns=['amt', 'gender'])\n\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "intent": "# Boxplots of Amount by Gender"}, {"original_comment": "# ## Read in information\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)", 
"target_code": "PLdb = pd.read_csv(\n '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)\n\n\n\nDATE = '04202020'\n", "project_metadata": {"full_name": "MrOlm/covid19_population_genomics", "description": "Analysis of the population diversity of SARS-CoV-2 within and between individual patients", "topics": [], "git_url": "git://github.com/MrOlm/covid19_population_genomics.git", "stars": 9, "watchers": 9, "forks": 1, "created": "2020-03-20T16:01:19Z", "size": 170583, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 40959012, "Python": 1028}, "last_updated": "2020-12-05T12:24:09Z"}, "intent": "# Read in information"}, {"original_comment": "# ### The following code cells shape the new arrays.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n#%%\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n#%%\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n#%%\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n#%%\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n#%%\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n#%%\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n#%%\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n#%%\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)", "target_code": "np.shape(d_norm)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the 
y-value for the plots.\n\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)\n\n\n\n", "project_metadata": {"full_name": "abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation", "description": "initial commit", "topics": [], "git_url": "git://github.com/abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-11-06T17:59:31Z", "size": 13456, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5835338, "Python": 6376}, "last_updated": "2020-11-12T20:56:51Z"}, "intent": "# shape array"}, {"original_comment": "# Shifted dataset\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the 
model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n#%%\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n#%%\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n#%%\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n#%%\n\nout\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n#%%\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = 
rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n#%%\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n#%%\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n#%%", "target_code": "train_20, validation_20 = get_MNIST_data(shift=20)\ntrain_20 = rescale(train_20)\nvalidation_20 = rescale(validation_20)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in 
layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n\nout\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n\ntrain, validation = get_MNIST_data()\n\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 
1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n", "project_metadata": {"full_name": "elahea2020/6.036", "description": "Homework solutions of Intro to ML course at MIT Spring 2018", "topics": ["ml", "machine-learning", "machine-learning-algorithms", "mit", "6036", "perceptron-learning-algorithm", "rnn"], "git_url": "git://github.com/elahea2020/6.036.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2018-05-08T21:21:54Z", "size": 65530, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 18939819, "Python": 168769}, "last_updated": "2020-10-25T08:09:38Z"}, "intent": "# Shifted dataset"}, {"original_comment": " # Tokenize words in reviews:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\nimport nltk\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n#%%\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. 
col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n#%%\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n#%%\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n#%%\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. (Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''", "target_code": " import nltk\n\n df_with_reviews['rev_cleaned'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: nltk.word_tokenize(x))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. 
meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. 
(Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''\n", "project_metadata": {"full_name": "MeredithLevsen/InsightProject", "description": "GameOn - Quickly evaluate board games based on user reviews", "topics": [], "git_url": "git://github.com/MeredithLevsen/InsightProject.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2018-07-17T17:31:15Z", "size": 541, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1045270, "HTML": 265408}, "last_updated": "2018-12-04T03:47:10Z"}, "intent": " # Tokenize words in reviews:"}, {"original_comment": "# Training the classifier with NaiveBayes algorithm\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n#%%\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n#%%\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n#%%\n\nprint(spam.head())\n\n#%%\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n#%%\n\nprint(data_set[:5])\n\n#%%\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n#%%\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n#%%\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n#%%\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n#%%\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n#%%\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n#%%\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n#%%\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n#%%\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n#%%\n\n# - shuffle the pack to create a 
random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n#%%\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n#%%\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n#%%\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n#%%\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n#%%\n\nprint(training_set[:5])\n\n#%%\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n#%%", "target_code": "spamClassifier = nltk.NaiveBayesClassifier.train(training_set)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n\nprint(spam.head())\n\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n\nprint(data_set[:5])\n\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n\n# - creating the word features for the entire 
dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n\nprint(training_set[:5])\n\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n", "project_metadata": {"full_name": "beingdatum/NaturalLanguageProcessing", "description": null, "topics": [], "git_url": "git://github.com/beingdatum/NaturalLanguageProcessing.git", "stars": 3, "watchers": 3, "forks": 10, "created": "2020-01-01T13:54:22Z", "size": 23376, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2267856, "Python": 1378}, "last_updated": "2020-06-08T09:54:47Z"}, "intent": "# Training the classifier with NaiveBayes algorithm"}, {"original_comment": "# Tweak spacing to prevent clipping of ylabel\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n#%%\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n#%%\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n#%%\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n 
ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n#%%\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n#%%\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n#%%\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n 
arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 
3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n#%%\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for 
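# Hedged sketch, not the project's own helper: recent h5py releases open files read-only
# unless a mode is given, so the bare h5py.File(path) writes above rely on an older default.
# The same 'data'/'hdr' layout with an explicit write mode (path and shapes are placeholders):
import numpy as np
import h5py

def write_sample(path, ldr_stack, hdr_image):
    with h5py.File(path, 'w') as hdf:
        hdf.create_dataset('data', data=np.asarray(ldr_stack, dtype=np.float32), compression='gzip')
        hdf.create_dataset('hdr', data=np.asarray(hdr_image, dtype=np.float32), compression='gzip')

write_sample('sample.h5', np.zeros((1, 30, 64, 64)), np.zeros((1, 3, 64, 64)))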
items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n#%%\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n#%%\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n#%%\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n#%%\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = 
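# Illustrative alternative (not from the original notebook): the TRAINFILE/TESTFILE writers
# above index into the glob result and append '\n' manually; writing the joined list keeps
# the same output format. The pattern and output path below are placeholders.
import glob

def write_file_list(pattern, out_path):
    files = sorted(glob.glob(pattern))
    with open(out_path, 'w') as fo:
        fo.write('\n'.join(files) + ('\n' if files else ''))

write_file_list('new_training_set/*.h5', 'new_train_list.txt')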
items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n#%%\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n#%%\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n#%%\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n#%%\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", 
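# Reference sketch for comparison: the PSNR loop above computes 20*log10(maxVal / MSE),
# i.e. it divides by the mean squared error itself, whereas PSNR is conventionally
# 10*log10(peak**2 / MSE) (equivalently 20*log10(peak / sqrt(MSE))). A minimal,
# self-contained version averaging over all pixels and channels:
import numpy as np

def psnr(pred, ref):
    """Peak signal-to-noise ratio in dB for two arrays of the same shape."""
    mse = np.mean((pred.astype(np.float64) - ref.astype(np.float64)) ** 2)
    if mse == 0:
        return float('inf')
    peak = float(ref.max())
    return 10.0 * np.log10((peak ** 2) / mse)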
res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n#%%\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n#%%\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n#%%\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n#%%\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n#%%\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n#%%\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = 
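# Hedged note with a small helper: cv2.createTonemapDurand is not present in every OpenCV
# build (it was moved out of the main module in the 4.x series), while the Reinhard, Drago
# and Mantiuk operators used above generally remain available. The paths are placeholders,
# and reading .exr files assumes an OpenCV build with OpenEXR support, as in the notebook.
import cv2
import numpy as np

def tonemap_to_jpg(exr_path, jpg_path, gamma=1.3):
    hdr = cv2.imread(exr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
    ldr = cv2.createTonemapReinhard(gamma=gamma).process(hdr)
    cv2.imwrite(jpg_path, np.clip(ldr * 255, 0, 255).astype('uint8'))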
np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n#%%\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')", "target_code": "plt.subplots_adjust(left=0.15)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = 
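# Hedged sketch: matplotlib.mlab.normpdf has been removed in current matplotlib releases,
# and overlaying a probability density on a count histogram (as in the snippet above) mixes
# scales. Using a density-normalized histogram with scipy.stats.norm keeps both curves
# comparable; the variable names are illustrative.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

def plot_hist_with_normal_fit(values, num_bins=50):
    values = np.asarray(values, dtype=np.float64).ravel()
    mu, sigma = values.mean(), values.std()
    _, bins, _ = plt.hist(values, num_bins, density=True, facecolor='green')
    plt.plot(bins, norm.pdf(bins, mu, sigma), 'r--')
    plt.xlabel('Range')
    plt.ylabel('Density')
    plt.show()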
[]\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n 
hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = 
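# Small aside on the keyFunc helpers defined repeatedly in this file: the pattern "\D" is
# written without a raw string, which newer Python versions flag as an invalid escape
# sequence. An equivalent numeric-sort key with a raw-string pattern and a guard for
# names that contain no digits:
import re

def numeric_key(filename):
    digits = re.sub(r"\D", "", filename)
    return int(digits) if digits else 0

print(sorted(['image_10.png', 'image_2.png'], key=numeric_key))  # ['image_2.png', 'image_10.png']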
re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in 
sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n 
loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = 
tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = 
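# Hedged sketch: the hand-built smoothing kernel above uses exp(-4*(x/sigma)**2) with a
# 1/(sqrt(2*pi)*sigma) prefactor, so it does not sum to one and the smoothed loss curve is
# rescaled by a sigma-dependent factor. A normalized Gaussian kernel preserves the curve's
# level; output length equals input length thanks to edge padding.
import numpy as np

def smooth_curve(values, sigma=500, radius_factor=4):
    values = np.asarray(values, dtype=np.float64)
    half = int(radius_factor * sigma)
    x = np.arange(-half, half + 1)
    kernel = np.exp(-0.5 * (x / sigma) ** 2)
    kernel /= kernel.sum()
    padded = np.pad(values, half, mode='edge')
    return np.convolve(padded, kernel, mode='valid')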
loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')\n", "project_metadata": {"full_name": "kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN", "description": "Predict HDR images from LDR images using CNN", "topics": [], "git_url": "git://github.com/kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2017-07-10T10:31:45Z", "size": 16499, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 96258, "Python": 44059, "MATLAB": 26466, "Shell": 15315, "M": 423}, "last_updated": "2020-07-07T08:49:43Z"}, "intent": "# Tweak spacing to prevent clipping of ylabel"}, {"original_comment": "# Set notebook to full width\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\n#%%", "target_code": "from IPython.core.display import display, HTML\ndisplay(HTML(\"\"))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_moons, make_circles\nimport tensorflow as tf\nimport numpy as np\nimport math\n", "project_metadata": {"full_name": "mmarius/nnia-tutorial", "description": "Repository for my tutorial group which is part of the lecture Neural Networks: Implementation and Application", "topics": [], "git_url": "git://github.com/mmarius/nnia-tutorial.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2017-11-02T15:20:51Z", "size": 12430, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1494110}, "last_updated": "2020-05-07T22:34:03Z"}, "intent": "# Set notebook to full width"}, {"original_comment": "# print out all data shapes\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n#%%\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n#%%\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. 
Pre-process\n\n#%%\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)", "target_code": "print('Training set', train_dataset.shape, train_labels.shape)\nprint('Validation set', validation_dataset.shape, validation_labels.shape)\nprint('Test set', test_dataset.shape, test_labels.shape)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. Pre-process\n\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)\n", "project_metadata": {"full_name": "cwru-robotics/cwru_dnn", "description": "deep neural net explorations", "topics": [], "git_url": "git://github.com/cwru-robotics/cwru_dnn.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-07-25T14:47:31Z", "size": 49625, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 329694, "Python": 19000, "C++": 17781, "CMake": 7310}, "last_updated": "2020-03-13T14:59:53Z"}, "intent": "# print out all data shapes"}, {"original_comment": "# Initialize from pre-trained model\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = 
osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n#%%\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n#%%\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n#%%\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n#%%\n\ndf['source'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'partition']).size()\n\n#%%\n\ndf['digit'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n#%%\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n#%%\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n#%%\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n#%%\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n#%%\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n#%%\n\ntarget_shape = (32, 32)\n\n#%%\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 
1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n#%%\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n#%%\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n#%%\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n#%%\n\nwandb.init()\n\n\n# ### Run Training\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n#%%\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n#%%\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n 
optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)", "target_code": "model.load_weights(MODEL_PATH_HEAD)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 
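# Compact sketch of the head-then-fine-tune pattern this record follows; build_model stands
# in for the project's get_digit_model factory, and the batch iterators and checkpoint path
# are placeholders, so this is illustrative rather than the project's exact training script.
import keras

def two_phase_training(build_model, train_batches, test_batches, head_path):
    # Phase 1: train only the randomly initialized head on top of the frozen base.
    model = build_model(trainable_base=False)
    model.compile(optimizer=keras.optimizers.RMSprop(lr=1e-3),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    callbacks = [
        keras.callbacks.ModelCheckpoint(head_path, monitor='val_loss', mode='min',
                                        save_best_only=True),
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='min'),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, mode='min'),
    ]
    model.fit_generator(train_batches, epochs=250, callbacks=callbacks,
                        validation_data=test_batches)

    # Phase 2: rebuild with all layers trainable, reload the best head-only weights,
    # and fine-tune with a ten-times smaller learning rate.
    model = build_model(trainable_base=True)
    model.load_weights(head_path)
    model.compile(optimizer=keras.optimizers.RMSprop(lr=1e-4),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit_generator(train_batches, epochs=50, validation_data=test_batches)
    return model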
9375\nlen(df)\n\n\ndf['source'].value_counts()\n\n\ndf.groupby(['source', 'partition']).size()\n\n\ndf['digit'].value_counts()\n\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n\ntarget_shape = (32, 32)\n\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) 
and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n\nMODEL_DIR\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n\nwandb.init()\n\n\n# ### Run Training\n\n\nMODEL_DIR\n\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "intent": "# Initialize from pre-trained model"}, {"original_comment": "# ask Yahoo! Finance for data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. 
Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! Finance.\n\n#%%\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
  - header: include the first row as the header for the DataFrame object\n#
  - index_col: set the index column of the DataFrame to the first column of the data set ('month')\n#
  - parse_dates: automatically parse dates which will index the DataFrame\n#
  - sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon (a short sketch of these arguments follows below)\n#
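#
# A minimal sketch (assuming the same './data/passengers.csv' file that is loaded in the
# next cell) of what the arguments listed above change; the printed index types are the
# expected outcome, not output taken from the original notebook:
import pandas as pd

raw = pd.read_csv('./data/passengers.csv', sep=';')          # 'month' stays an ordinary column
parsed = pd.read_csv('./data/passengers.csv', header=0,
                     index_col=0, parse_dates=True, sep=';')  # 'month' becomes the DatetimeIndex
print(type(raw.index).__name__)     # typically RangeIndex
print(type(parsed.index).__name__)  # DatetimeIndex, enabling slices such as parsed.loc['1952-01']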
\n\n#%%\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n#%%\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n#%%\n\n# print the type of an object\ntype(df)\n\n#%%\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n#%%\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n#%%\n\n# list the index of the DataFrame\ndf.index\n\n#%%\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n#%%\n\n# print first 5 rows\ny.head()\n\n#%%\n\n# print type of the ts object\ntype(y)\n\n#%%\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n#%%\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n#%%\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n#%%\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n#%%", "target_code": "corporate_actions = web.DataReader(tickers, 'yahoo-actions', start, end)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
  - header: include the first row as the header for the DataFrame object\n#
  - index_col: set the index column of the DataFrame to the first column of the data set ('month')\n#
  - parse_dates: automatically parse dates which will index the DataFrame\n#
  - sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon\n#
\n\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n\n# print the type of an object\ntype(df)\n\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n\n# list the index of the DataFrame\ndf.index\n\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n\n# print first 5 rows\ny.head()\n\n\n# print type of the ts object\ntype(y)\n\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n", "project_metadata": {"full_name": "dacatay/time-series-analysis", "description": "Presentation for time series analysis", "topics": [], "git_url": "git://github.com/dacatay/time-series-analysis.git", "stars": 41, "watchers": 41, "forks": 53, "created": "2017-09-08T13:45:56Z", "size": 43990, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12370243, "R": 4829}, "last_updated": "2020-11-05T10:34:15Z"}, "intent": "# ask Yahoo! Finance for data"}, {"original_comment": "# ### 1.5 Distribution of rgb channels of negative examples\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n#%%\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n#%%\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n#%%\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n#%%\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# 
plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()", "target_code": "for n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. 
Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\n", "project_metadata": {"full_name": "itratrahman/covid_19", "description": "This project contains AI and Data Science projects that analyses disease classification from images, forecasting, and EDA report of the pandemic.", "topics": [], "git_url": "git://github.com/itratrahman/covid_19.git", "stars": 5, "watchers": 5, "forks": 0, "created": "2020-03-22T03:36:28Z", "size": 26502, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6190010}, "last_updated": "2020-04-28T07:40:43Z"}, "intent": "# 1.5 Distribution of rgb channels of negative examples"}, {"original_comment": "# # Correlation between variables\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n#\n# Dec 10 - Dec 20, 2019\n#\n# Lab on Logistic Regression

\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
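#
# A minimal NumPy sketch of the prediction rule described above, using made-up
# coefficients purely for illustration (the derivation continues just below):
import numpy as np

def sigmoid_predict(X, beta0, beta, threshold=0.5):
    """Return (class labels, probabilities) for the rows of X under the logistic model."""
    z = beta0 + X @ beta              # linear part: beta_0 + beta . x_i
    p = 1.0 / (1.0 + np.exp(-z))      # logistic/sigmoid transform maps z into (0, 1)
    return (p >= threshold).astype(int), p

# two made-up samples with two features each
labels, probs = sigmoid_predict(np.array([[0.5, 1.0], [-2.0, 0.3]]),
                                beta0=0.1, beta=np.array([1.5, -0.7]))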
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n#%%\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n#%%\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n#%%\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.describe()\n\n#%%\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.isnull().any()", "target_code": "corr_mat = df[variables].corr().round(2)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n#\n# Dec 10 - Dec 20, 2019\n#\n# Lab on Logistic Regression

\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n\ndf.columns\n\n\ndf.describe()\n\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n\ndf.head()\n\n\ndf.isnull().any()\n\n\n\nvariables = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',\n 'Ticket', 'Fare', 'Embarked', 'Survived', 'Initial']\n# Calculate the correlations\n", "project_metadata": {"full_name": "ghimireadarsh/AI-WinterSchool", "description": "Comprises of various lecture slides, papers, practical notebooks used during AI Winter school, organized by NAAMII at Pokhara, Nepal from December 10, 2019 to December 20, 2019. ", "topics": [], "git_url": "git://github.com/ghimireadarsh/AI-WinterSchool.git", "stars": 6, "watchers": 6, "forks": 6, "created": "2019-12-14T18:16:09Z", "size": 75918, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1041087, "HTML": 666537, "Python": 20395}, "last_updated": "2020-09-27T21:32:34Z"}, "intent": "# Correlation between variables"}, {"original_comment": "# And the smallest?\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's start with the basics\n\n#%%\n\nsimple = list(range(1, 19))\nsimple\n\n#%%\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n#%%\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n#%%\n\n# Select the last item using positive indexation\nsimple[17]\n\n#%%\n\n# Select the last item using negative indexation\nsimple[-1]\n\n#%%\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n#%%\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n#%%\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n#%%\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n#%%\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n#%%\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n#%%\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n#%%\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n#%%\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n#%%\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n#%%\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n#%%\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n#%%\n\n# What is the biggest number in the list?\nmax(simple)\n\n#%%", "target_code": "min(simple)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's start with the basics\n\n\nsimple = list(range(1, 19))\nsimple\n\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n\n# 
Select the 1st item using negative indexation\nsimple[-18]\n\n\n# Select the last item using positive indexation\nsimple[17]\n\n\n# Select the last item using negative indexation\nsimple[-1]\n\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n\n# What is the biggest number in the list?\nmax(simple)\n\n", "project_metadata": {"full_name": "shotleft/how-to-python", "description": null, "topics": [], "git_url": "git://github.com/shotleft/how-to-python.git", "stars": 11, "watchers": 11, "forks": 4, "created": "2018-05-03T04:32:17Z", "size": 3364, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2974562}, "last_updated": "2020-12-05T20:07:29Z"}, "intent": "# And the smallest?"}, {"original_comment": "# Create figure and dimensions\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n#%%\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------", "target_code": "plt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------\n", "project_metadata": {"full_name": "isaacarroyov/ss_plots", "description": "Repositorio de gr\u00e1ficas realizadas en Python para mis boletines de servicio social (Ecuaciones Diferenciales y An\u00e1lisis Vectorial) || Repository of the plots made in Python for my social service bulletins (Differential Equations and Vector Calculus)", "topics": ["differential-equations", "math", "vector-analysis", "university", "python3", "python", "ecuaciones-diferenciales"], "git_url": "git://github.com/isaacarroyov/ss_plots.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-27T19:15:30Z", "size": 21849, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 29758848}, "last_updated": "2020-11-24T18:53:41Z"}, "intent": "# Create figure and dimensions"}, {"original_comment": "# ### MinMax Scaling\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n#%%\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n#%%\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data", "target_code": " import numpy as np\n\n numerator = data - np.min(data, 0)\n denominator = np.max(data, 0) - np.min(data, 0)\n return numerator / (denominator + 1e-7)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data\n\n\n\ndef MinMaxScaler(data):\n ''' Min Max Normalization\n 
Parameters\n ----------\n data : numpy.ndarray\n input data to be normalized\n shape: [Batch size, dimension]\n Returns\n ----------\n data : numpy.ndarry\n normalized data\n shape: [Batch size, dimension]\n References\n ----------\n .. [1] http://sebastianraschka.com/Articles/2014_about_feature_scaling.html\n '''\n", "project_metadata": {"full_name": "jwlee-ml/TensorFlow_Training_13th", "description": "Tensorflow\ub85c \uc2dc\uc791\ud558\ub294 \ub525\ub7ec\ub2dd Camp 13\uae30 \uc2e4\uc2b5", "topics": [], "git_url": "git://github.com/jwlee-ml/TensorFlow_Training_13th.git", "stars": 4, "watchers": 4, "forks": 5, "created": "2019-06-14T14:39:05Z", "size": 23519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23325250}, "last_updated": "2019-11-05T13:31:34Z"}, "intent": " # MinMax Scaling"}, {"original_comment": "# #### Submit the job\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n#%%\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker.estimator import Estimator\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n#%%\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)", "target_code": "from sagemaker.estimator import Estimator\n\nestimator = Estimator(role=role,\n train_instance_count=n_instances,\n train_instance_type=instance_type,\n image_name=image_name,\n hyperparameters=hyperparameters)\nestimator.fit()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 
'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)\n\n\n\n", "project_metadata": {"full_name": "daniel-fudge/DRL-Portfolio-Optimization-Custom", "description": "A portfolio optimization framework leveraging Deep Reinforcement Learning (DRL) and a custom trading environment", "topics": [], "git_url": "git://github.com/daniel-fudge/DRL-Portfolio-Optimization-Custom.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-06-12T22:27:29Z", "size": 35064, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1170339, "Python": 39958, "Shell": 4637}, "last_updated": "2020-11-01T22:06:49Z"}, "intent": "# Submit the job"}, {"original_comment": " # make sure output directory exists\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n#%%\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):", "target_code": " 
output_dir = pathlib.Path(output_dir)\n output_dir.mkdir(exist_ok=True)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):\n", "project_metadata": {"full_name": "thehackerwithin/illinois", "description": "THW Chapter at U. 
Illinois", "topics": [], "git_url": "git://github.com/thehackerwithin/illinois.git", "stars": 13, "watchers": 13, "forks": 31, "created": "2015-02-18T19:38:33Z", "size": 61361, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 868658, "TeX": 34143, "R": 18922, "HTML": 10291, "Julia": 5254, "Python": 4028, "C++": 425, "CMake": 94}, "last_updated": "2020-09-30T18:16:33Z"}, "intent": " # make sure output directory exists"}, {"original_comment": "# ### Compute the explained variance for new data set.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n#%%\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata.isnull().sum()\n\n#%%\n\ndata.dropna(inplace=True)\n\n#%%\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n#%%\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)", "target_code": "pca.explained_variance_\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata.isnull().sum()\n\n\ndata.dropna(inplace=True)\n\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of 
the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)\n\n\n\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# Compute the explained variance for new data set."}, {"original_comment": "# init the Support Vector Machine classifier\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
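# As a complement to the visual assessment that follows, a quick numeric per-class summary already hints at which measurements separate the three flower classes. The following is a small optional sketch (it only relies on the `iris` object and the `pandas` import from above):

#%%

# per-class mean and standard deviation of the four measurements
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target_names[iris.target]
iris_df.groupby('species').agg(['mean', 'std'])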
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
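# Both distances are ordinary point-to-hyperplane distances, i.e. $|x \cdot w + b| / ||w||$. A tiny optional numeric illustration with made-up numbers (not taken from the Iris data):

#%%

import numpy as np

# distance of an arbitrary point to a hypothetical hyperplane w . x + b = 0
w = np.array([2.0, 1.0])  # hyperplane normal (made up)
b = -4.0                  # hyperplane offset (made up)
x = np.array([3.0, 2.0])  # an arbitrary point (made up)

distance = np.abs(np.dot(w, x) + b) / np.linalg.norm(w)
print(distance)           # approximately 1.789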
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
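# Because of this, the primal weight vector can be recovered from the dual solution via $w = \sum_{i} \alpha_{i}y_{i}x_{i}$. In `Scikit-Learn` the products $\alpha_{i}y_{i}$ of the support vectors are stored in `dual_coef_` and the corresponding $x_{i}$ in `support_vectors_`. The following optional sketch checks this identity; it assumes a *two-class* `SVC(kernel='linear')` that has already been fitted, such as the versicolor-vs-virginica model of Section 1.3.7:

#%%

import numpy as np

# recover w from the dual solution and compare it with the primal coefficient vector
# (assumes `svm` is a fitted two-class SVC with a linear kernel)
w_from_dual = svm.dual_coef_ @ svm.support_vectors_

# both vectors should coincide up to numerical precision
print(np.allclose(w_from_dual, svm.coef_))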
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n#%%\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n#%%\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n#%%\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n#%%\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n#%%\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n#%%\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. 
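# The same table can be tabulated directly with `Pandas`; the optional sketch below cross-tabulates predicted against true class names using the `y_pred` and `y_eval` arrays from above (rows correspond to predicted classes, columns to true classes, matching the description that follows):

#%%

# cross-tabulate predicted vs. true class names
pd.crosstab(index=pd.Series(iris.target_names[y_pred], name='predicted'),
            columns=pd.Series(iris.target_names[y_eval], name='true'))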
Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n#%%\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. 
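# Since this repeats the predict-then-lookup pattern used for the first observation, a tiny optional helper keeps it to a single call (the function name and usage below are introduced here for illustration only):

#%%

def predict_class_name(model, features, target_names):
    '''Predict a single observation and return its class name instead of the integer label.'''
    return target_names[model.predict([features])[0]]

# usage with the first unknown observation from above
print(predict_class_name(svm, [5.8, 3.5, 1.5, 0.25], iris.target_names))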
The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n#%%\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. 
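# Before writing the penalised objective down, the slack variables themselves can be made concrete numerically: for a fitted soft-margin model, $\xi_{i} = \max(0, 1 - y_{i}(x_{i} \cdot w + b))$. The sketch below is optional and introduces its own names; it assumes a two-class split such as `x_train_test`, `y_train_test` (defined a bit further below in Section 1.3.7) and a fitted model `svm_bin = SVC(kernel='linear', C=1).fit(x_train_test, y_train_test)`:

#%%

import numpy as np

# recode the two class labels to -1 / +1, following scikit-learn's internal class ordering
y_pm = np.where(y_train_test == svm_bin.classes_[1], 1.0, -1.0)

# decision_function returns x_i . w + b for every training sample
decision = svm_bin.decision_function(x_train_test)

# empirical slack variables and the resulting bound on the training errors
xi = np.maximum(0.0, 1.0 - y_pm * decision)
print('sum of slack variables:', xi.sum())   # upper bound on the number of training errors
print('misclassified samples :', int(np.sum(y_pm * decision < 0)))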
The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n#%%\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n#%%\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. 
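# Besides listing the support vectors, two scalar summaries are worth printing before looking at the plot: the number of support vectors per class and the resulting margin width $2/||w||$. A short optional sketch for the two-class model fitted just above:

#%%

import numpy as np

# support vectors per class and margin width of the fitted linear model
print('support vectors per class:', svm.n_support_)

w = svm.coef_[0]   # normal vector of the separating hyperplane
print('margin width 2/||w||: {:.4f}'.format(2.0 / np.linalg.norm(w)))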
Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n#%%\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. 
Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. 
polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n#%%\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. 
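# For the classifiers used in the first part of this lab, each such image is typically flattened into a 784-dimensional feature vector with pixel intensities rescaled to $[0, 1]$. The following optional sketch illustrates this; it refers to the `mnist_train_data_images` and `mnist_train_data_labels` arrays that are only created in Section 2.1.1 below:

#%%

# flatten each 28x28 image into a 784-dimensional feature vector in [0, 1]
x_flat = mnist_train_data_images.reshape(len(mnist_train_data_images), -1) / 255.0
y_flat = mnist_train_data_labels

# expected shapes: (60000, 784) and (60000,)
print(x_flat.shape, y_flat.shape)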
The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n#%%\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n#%%\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n#%%\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n#%%\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n#%%\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n#%%\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n#%%\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n#%%\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. 
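# As a quick preview (a small illustrative sketch that is not part of the original lab flow; the step-by-step construction of the descriptor follows below), the complete HOG descriptor of the sample evaluation image shown above can be obtained directly via the `hog` function of `skimage.feature`, which is also used later in this notebook with the same parametrisation (4 orientations, 7x7 pixel cells):

#%%

# preview sketch: extract and visualise the HOG descriptor of the sample image
preview_features, preview_image = hog(mnist_eval_image, orientations=4,
                                      pixels_per_cell=(7, 7), cells_per_block=(1, 1),
                                      visualize=True)

# 16 patches x 4 orientations yield a 64-dimensional feature vector
print(preview_features.shape)

# plot the corresponding hog visualisation
plt.imshow(preview_image, cmap='gray')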
Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. 
This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients, given by the gradient's L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients, given by the gradient's arctangent: $\\arctan (\\frac{g_{y}}{g_{x}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervals of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation $d$, and the vote ( the value that goes into the bin ) is derived from its gradient magnitude $m$. If, for example, a pixel exhibits the orientation $d=39\u00b0$ and the magnitude $m=297$, the magnitude is split between the two neighbouring bins starting at $20\u00b0$ and $40\u00b0$ (bin width $20\u00b0$) in proportion to the distance from the respective opposite bin boundary:\n\n# $$vote(b_{3}) = \\frac{|d - 20|}{20} \\times m = \\frac{|39 - 20|}{20} \\times 297 = 282.15$$\n#\n# $$vote(b_{2}) = \\frac{|d - 40|}{20} \\times m = \\frac{|39 - 40|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected, the obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2$-norm over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... + 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.48, ..., 0.34]$$\n#\n# where $i$ denotes the current one of the $N=16$ image patches. 
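# To make the above steps concrete, here is a small illustrative sketch (not part of the original lab, and intentionally simplified compared to the `hog` implementation used below): it recomputes the two gradient maps with 'same'-mode convolutions so that they share the image shape, derives the magnitude and orientation maps, and accumulates a 9-bin, bilinearly weighted and L2-normalised orientation histogram for a single 7x7 patch:

#%%

# illustrative sketch: 9-bin orientation histogram of a single 7x7 patch

# recompute the gradient maps with 'same'-mode convolutions so that g_x and g_y
# share the 28x28 image shape (the plots above used the default 'full' mode)
img = mnist_eval_image.astype('float32')
g_x_same = sp.signal.convolve2d(img, kernel_x, mode='same')
g_y_same = sp.signal.convolve2d(img, kernel_y, mode='same')

# gradient magnitude and orientation, the latter mapped to [0, 180) degrees
magnitude = np.hypot(g_x_same, g_y_same)
orientation = np.rad2deg(np.arctan2(g_y_same, g_x_same)) % 180.0

# select a single 7x7 patch (here: the top-left one)
patch_magnitude = magnitude[0:7, 0:7].ravel()
patch_orientation = orientation[0:7, 0:7].ravel()

# accumulate the bilinear votes into 9 bins of 20 degrees each
n_bins, bin_width = 9, 20.0
histogram = np.zeros(n_bins)
for m, d in zip(patch_magnitude, patch_orientation):
    lower_bin = int(d // bin_width) % n_bins
    upper_bin = (lower_bin + 1) % n_bins
    upper_share = (d - lower_bin * bin_width) / bin_width
    histogram[lower_bin] += (1.0 - upper_share) * m
    histogram[upper_bin] += upper_share * m

# L2-normalise the patch histogram as described above
histogram = histogram / (np.linalg.norm(histogram) + 1e-12)
print(histogram)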
Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n#%%\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n#%%\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n#%%\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n#%%\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n#%%\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n#%%\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n#%%", "target_code": "svm = SVC(kernel='linear', C=1, random_state=random_seed)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
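# As a small optional first step of this assessment (a sketch that is not part of the original lab), we can also inspect per-feature summary statistics and the class balance:


# per-feature summary statistics of the four measurements
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(iris_df.describe())

# class balance: the dataset is expected to contain 50 samples per class
print(np.bincount(iris.target))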
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
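# To make these quantities a bit more tangible, here is a tiny numeric sketch (the values of $w$, $b$ and $x$ below are assumed purely for illustration and are not taken from the lab):


# an assumed example hyperplane w * x + b = 0 and an assumed example point x
w_example = np.array([3.0, 4.0])
b_example = -2.0
x_example = np.array([2.0, 1.0])

# perpendicular distance of the hyperplane to the origin: |b| / ||w|| = 2 / 5 = 0.4
print(np.abs(b_example) / np.linalg.norm(w_example))

# perpendicular distance of the point x to the hyperplane: |w * x + b| / ||w|| = 8 / 5 = 1.6
print(np.abs(np.dot(w_example, x_example) + b_example) / np.linalg.norm(w_example))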
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. 
Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. 
The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. 
The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. 
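# Before plotting, a quick numeric check (a sketch, not part of the original lab): for a linear kernel the learned normal vector $w$ and offset $b$ are exposed via the `coef_` and `intercept_` attributes of the fitted model, so the geometric margin $2/||w||$ can be read off directly:


# learned hyperplane parameters of the fitted linear SVM
w_learned = svm.coef_[0]
b_learned = svm.intercept_[0]
print('w = {}, b = {:.4f}'.format(w_learned, b_learned))

# number of support vectors per class and the resulting geometric margin 2 / ||w||
print('support vectors per class: {}'.format(svm.n_support_))
print('margin 2/||w||: {:.4f}'.format(2.0 / np.linalg.norm(w_learned)))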
Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. 
Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. 
polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. 
It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. 
Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. 
This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... + 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. 
This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "intent": "# init the Support Vector Machine classifier"}, {"original_comment": "# Reload with new savename\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n#%%\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n#%%\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n#%%\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n#%%\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n#%%\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n#%%\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n#%%\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n#%%\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n#%%\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n#%%\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the 
unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n#%%\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n#%%", "target_code": "learn = create_learner(data, 'stage2-bestmodel')\nlearn.load('stage1-bestmodel')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find 
crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n", "project_metadata": {"full_name": "URBNOpenSource/custom-vision-study", "description": null, "topics": [], "git_url": "git://github.com/URBNOpenSource/custom-vision-study.git", "stars": 5, "watchers": 5, "forks": 4, "created": "2019-03-12T20:31:02Z", "size": 19785, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5009642, "Python": 5509, "Shell": 928}, "last_updated": "2019-10-24T13:27:26Z"}, "intent": "# Reload with new savename"}, {"original_comment": "# ## C. Calculate the Lateral Inflow Hydrographs:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n#%%\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n#%%\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n#%%\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n#%%\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n#%%\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n#%%\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n#%%\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n#%%\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. 
Calculate Reduced Excess Rainfall:\n\n#%%\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---", "target_code": " ReducedTable = calc_lateral_inflow_hydro(\n lid, ReducedTable, StormwaterTable, durations, BCN, display_print)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. 
Calculate Reduced Excess Rainfall:\n\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---\n\n\n\nif lid.shape[0] > 0:\n", "project_metadata": {"full_name": "Dewberry/pfra-hydromet", "description": "Tools for developing pluvial (excess rainfall) and fluvial scenarios for probabilistic flood risk analyses", "topics": ["hydrology", "papermill", "montecarlo-simulation"], "git_url": "git://github.com/Dewberry/pfra-hydromet.git", "stars": 11, "watchers": 11, "forks": 12, "created": "2019-04-18T13:04:55Z", "size": 165396, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 59869977, "Python": 186157}, "last_updated": "2020-10-27T14:37:20Z"}, "intent": " # C. Calculate the Lateral Inflow Hydrographs:"}, {"original_comment": "# output layer\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n#%%\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n#%%\n\n'''\nLSTM playground\n'''\n\n#%%\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n#%%\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n#%%\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n#%%\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n 
batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n#%%\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n#%%\n\nbatchedInputMatrix.shape\n\n#%%\n\nbatchedOutputMatrix.shape\n\n#%%\n\nbatchedInputMatrix[0, 0]\n\n#%%\n\nbatchedOutputMatrix[0, 0]\n\n#%%\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). - specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n#%%\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n#%%\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n#%%\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))", "target_code": "simpleStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# 
question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n'''\nLSTM playground\n'''\n\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n\nbatchedInputMatrix.shape\n\n\nbatchedOutputMatrix.shape\n\n\nbatchedInputMatrix[0, 0]\n\n\nbatchedOutputMatrix[0, 0]\n\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch 
will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). - specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))\n", "project_metadata": {"full_name": "miroenev/teach_DL", "description": null, "topics": [], "git_url": "git://github.com/miroenev/teach_DL.git", "stars": 36, "watchers": 36, "forks": 15, "created": "2017-07-19T18:01:29Z", "size": 98182, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12259106, "Python": 43930, "Dockerfile": 2478, "Shell": 1713}, "last_updated": "2020-09-04T16:13:54Z"}, "intent": "# add output layer"}, {"original_comment": "# instead we can reindex:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n#%%\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n#%%\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n#%%\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n#%%\n\ncombine[0]", "target_code": "combine.index = range(combine.count())\n", "context": "#!/usr/bin/env python\n# 
coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n\ncombine[0]\n\n\n\n", "project_metadata": {"full_name": "ContextLab/CDL-tutorials", "description": "Repo containing useful tutorials on different topics, methods, software tools, and packages used by the CDL", "topics": ["tutorial", "training-materials", "python", "bayesian-methods", "package-creation", "scientific-computing"], "git_url": "git://github.com/ContextLab/CDL-tutorials.git", "stars": 12, "watchers": 12, "forks": 2, "created": "2017-12-15T13:36:50Z", "size": 59045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 694197, "Python": 17099, "TeX": 9149, "Makefile": 5644, "Batchfile": 5096, "Dockerfile": 3050, "Shell": 128}, "last_updated": "2020-07-13T19:39:57Z"}, "intent": "# we can reindex:"}, {"original_comment": "# access items from a list\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n#%%\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n#%%\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n#%%\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n#%%\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n#%%\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n#%%\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a 
list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n#%%\n\nsavelist # list of lists\n\n#%%\n\n# we can check the items of the list we created\nsavelist[1]\n\n#%%", "target_code": "savelist[0][1]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\nsavelist # list of lists\n\n\n# we can check the items of the list we created\nsavelist[1]\n\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, 
"license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "intent": "# access items from a list"}, {"original_comment": "# Transform the dates from strings to integers\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. 
This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. 
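# The annotation file itself can stay very simple. A hypothetical miniature of the expected layout
# (file name and rows are made up; the steps below only rely on a full-text column named 'text' and a
# numeric label column named 'relevancy'):

import pandas as pd

annotated = pd.DataFrame({
    'text': ['Auswanderer nach Amerika ...', 'Billige Schiffskarten ...'],  # made-up clippings
    'relevancy': [1, 0],                                                    # 1 = news, 0 = ads
})
annotated.to_csv('my_annotated_collection.csv', index=False)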
If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n#%%\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n#%%\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n#%%\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. However, it is important that the corpus is representative for this specific topic.\n\n#%%\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. 
It helps to ignore your search keywords.\n\n#%%\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n#%%\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n#%%\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n#%%\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n#%%\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. This is important so that the results are not distorted by over- or under-representation of a category.\n\n#%%\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n#%%\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n#%%\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. 
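# For reference, both priors are ordinary keyword arguments of gensim's LdaModel. A standalone sketch on
# a toy corpus (made-up tokens, not the notebook's data) that lets the model learn asymmetric priors:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_docs = [['auswanderung', 'schiff', 'amerika'],
            ['preis', 'anzeige', 'verkauf'],
            ['schiff', 'hafen', 'amerika']]
toy_dictionary = Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_docs]

toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2,
                   alpha='auto', eta='auto', passes=10)  # 'auto' learns both priors from the data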
For the text classification, a high number of topics is best suited. Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n#%%\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n#%%\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n#%%\n\ncorpus = train_df.bow\n\n#%%\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n#%%\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
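# For a single training document this representation can be inspected directly (a quick sketch reusing
# the `lda` model and `corpus` built above):

doc_topics = lda.get_document_topics(corpus[0], minimum_probability=0)
doc_vector = np.array([prob for _, prob in doc_topics])
print(doc_vector.shape, round(float(doc_vector.sum()), 3))  # 500 topic proportions, summing to ~1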
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n#%%\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n#%%\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n#%%\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n#%%\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n#%%\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n#%%\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the 
test corpus \n# First, have a look at your test corpus\n\n#%%\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n#%%\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n#%%\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n#%%\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n#%%\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n#%%\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n#%%\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n#%%\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
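# As a compact cross-check of the per-category numbers computed below, sklearn's classification report
# gives the corresponding per-class precision in one call (a sketch reusing the fitted grid-search model
# from above):

from sklearn.metrics import classification_report

y_pred = best_lr_lda.predict(X_test_lda)  # hard label predictions for the test corpus
print(classification_report(test_df.relevancy, y_pred))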
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n#%%\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n#%%\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n#%%\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n#%%\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n#%%\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n#%%\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n#%%\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n#%%\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n#%%\n\nresult = 
np.append(df_all.text.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n#%%\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n#%%\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n#%%\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n#%%\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n#%%\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n#%%\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n#%%\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])", "target_code": "for i in range(0, len(date_ads)):\n date_ads[i] = int(date_ads[i])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). 
The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. 
This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. 
If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. However, it is important that the corpus is representative for this specific topic.\n\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. 
It helps to ignore your search keywords.\n\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. This is important so that the results are not distorted by over- or under-representation of a category.\n\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. 
Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n\ncorpus = train_df.bow\n\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the test corpus \n# First, have a 
look at your test corpus\n\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n\nresult = np.append(df_all.text.values.reshape(-1, 1),\n 
submission_predictions, axis=1)\n\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])\n", "project_metadata": {"full_name": "NewsEye/NLP-Notebooks-Newspaper-Collections", "description": "A collection of notebooks for Natural Language Processing", "topics": ["lda", "topic-modeling", "shannon", "nlp-notebooks", "digital-humanities", "newspaper-collections", "newspaper-clippings", "text-classification", "similarity"], "git_url": "git://github.com/NewsEye/NLP-Notebooks-Newspaper-Collections.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-07-06T11:18:13Z", "size": 12866, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4306857}, "last_updated": "2020-12-01T08:54:40Z"}, "intent": "# Transform the dates from strings to integers"}, {"original_comment": "# Let's first look at the data frame.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#
\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#
\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n#%%\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing", "target_code": "rating.head()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#
\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#
\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing\n\n\n\n", "project_metadata": {"full_name": "liyenhsu/restaurant-data-with-consumer-ratings", "description": "Build recommender systems for restaurants", "topics": [], "git_url": "git://github.com/liyenhsu/restaurant-data-with-consumer-ratings.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2017-11-09T05:11:58Z", "size": 1373, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1230183}, "last_updated": "2020-10-11T20:40:42Z"}, "intent": "# look at the data frame"}, {"original_comment": "# Lists can also be added to:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n#%%\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. 
*Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n#%%\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n#%%\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n#%%\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n#%%\n\ntype(my_decimal)\n\n#%%\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n#%%\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n#%%\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? 
Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n#%%\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#", "target_code": "my_number_list.append(6)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. 
It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n\ntype(my_decimal)\n\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? 
What happens if you try to access the first element of the `my_number_list`?\n#\n\n\n\n", "project_metadata": {"full_name": "wrightaprilm/CompBio2018", "description": null, "topics": [], "git_url": "git://github.com/wrightaprilm/CompBio2018.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2018-08-14T16:08:48Z", "size": 8976, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4697133, "TeX": 4884, "Python": 4399}, "last_updated": "2019-06-27T20:53:35Z"}, "intent": "# add element to list"}, {"original_comment": "# ### Fit and predict Time Serie\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n#%%\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n#%%\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n#%%\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n#%%\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n#%%\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n#%%\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]", "target_code": " model = hw.ExponentialSmoothing(\n ts_train, seasonal='additive', seasonal_periods=train_data_length-1).fit()\n predictions = model.predict(start=ts_test.index[0], end=ts_test.index[-1])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel 
ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]\n\n\n\ndef print_hw_parameters(model):\n alpha, beta, gamma = model.params['smoothing_level'], model.params[\n 'smoothing_slope'], model.params['smoothing_seasonal']\n print(\"Holt-Winters parameters:\")\n print(\"Alpha: \", alpha)\n print(\"Beta: \", beta)\n print(\"Gamma: \", gamma)\n\n\nprint(\"Forecasting started...\")\nstart_time = time.time()\n\ntry:\n", "project_metadata": {"full_name": "CSIRT-MU/QoSForecastLSTM", "description": "An evaluation of QoS forecast methods described in paper Quality of Service Forecasting with LSTM Neural Networks", "topics": ["publication"], "git_url": "git://github.com/CSIRT-MU/QoSForecastLSTM.git", "stars": 4, "watchers": 4, "forks": 2, "created": "2018-09-05T07:37:36Z", "size": 10237, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16021131}, "last_updated": "2020-03-27T12:49:41Z"}, "intent": " # Fit and predict Time Serie"}, {"original_comment": "# Plot the histogram for the values of each year.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. 
Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. control for t1, t2, and t3.\n#\n\n#%%\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n#%%\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n#%%\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n#%%\n\n# Print the skewness and kurtosis of the groups. 
Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n#%%\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n#%%\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n#%%\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n#%%\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. 
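# (Editor's sketch for Exercise 2, assuming the cleaned `gdp` DataFrame that is built in the cell just below: plot a histogram of each year's values, then compare consecutive years with the paired, nonparametric Wilcoxon signed-rank test from `scipy.stats`.)

for year in ['2007', '2008', '2009']:
    plt.figure()
    plt.hist(gdp[year], bins=20)
    plt.title('Real GDP per capita, ' + year)
plt.show()

print(stats.wilcoxon(gdp['2007'], gdp['2008']))
print(stats.wilcoxon(gdp['2008'], gdp['2009']))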
We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n#%%\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n#%%\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n#%%", "target_code": "plt.figure()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. 
control for t1, t2, and t3.\n#\n\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 2009)?\n# - Select and apply the appropriate 
statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# Plot the histogram for the values of each year."}, {"original_comment": "# #### Check Missing Value\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n#%%\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n#%%\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n#%%\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = 
df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()", "target_code": "for col in df_train.columns:\n if any(df_train[col].isnull()):\n print(\"feature %s, missing %i entries\" %\n (col, sum(df_train[col].isnull())))\n else:\n print(\"feature %s has no missing value\" % col)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()\n\n\n\n", "project_metadata": {"full_name": "shawlu95/Data-Science-Toolbox", "description": "Examples and illustration of basic statistic concepts, probability distribution, Monte Carlo simulation, preprocessing and visualization techniques, and statistical testing.", "topics": [], "git_url": "git://github.com/shawlu95/Data-Science-Toolbox.git", "stars": 28, "watchers": 28, "forks": 11, "created": "2019-03-25T19:58:55Z", "size": 157445, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 52401937, "Python": 36992, "TSQL": 3834, "PLpgSQL": 3609, "Shell": 3459, "R": 1437}, "last_updated": "2020-12-26T18:51:43Z"}, "intent": "# Check Missing Value"}, {"original_comment": "# sample from the forecasted distribution\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. 
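# (Editor's aside, a minimal sketch referring back to the admissions DataFrames loaded above: a quick missing-value check on `df_train` and `df_pred` before any modelling, using pandas' isnull().)

print(df_train.isnull().sum())
print(df_pred.isnull().sum())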
Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n#%%\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n#%%\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n#%%\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n#%%\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n#%%\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n#%%\n\nX_train.shape, 
Y_train.shape\n\n#%%\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", fontsize=16)\n plt.ylabel(\"$x(t)$\", fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. 
The consensus number is the general agreement of experts on the outcome of the number. When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n#%%\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
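# (Editor's illustration of the point above, a minimal sketch with made-up numbers: the mean of a bimodal sample falls between the two peaks, where almost no observations actually occur.)

rng = np.random.RandomState(42)
bimodal = np.concatenate([rng.normal(-0.3, 0.05, 350), rng.normal(0.3, 0.05, 650)])
print(bimodal.mean())                                     # close to 0.09, between the two peaks
print(np.mean(np.abs(bimodal - bimodal.mean()) < 0.05))   # only a tiny fraction of samples lie near the mean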
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n#%%\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n#%%\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n#%%\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n#%%\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n#%%\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n#%%\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n#%%\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n#%%\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n#%%\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n#%%\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n#%%\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n#%%\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n#%%\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n#%%\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n#%%\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n#%%\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n#%%\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n#%%\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n#%%\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n#%%\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n#%%\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n#%%\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n#%%\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n#%%", "target_code": "smpl = yhat.sample(100).numpy()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
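# (Editor's sketch for the probabilistic forecast visualization section above: `yhat` is a TensorFlow Probability distribution, so we can draw arbitrarily many forecast scenarios with sample() and plot a few of them; the (100, 1, 10) sample shape is an assumption based on the single input example and the 10-step target.)

smpl = yhat.sample(100).numpy()    # 100 random forecast paths
for path in smpl[:20, 0, :]:
    plt.plot(np.arange(n_in, n_in + n_out), path, alpha=0.3)
plt.show()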
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n\nX_train.shape, Y_train.shape\n\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", fontsize=16)\n plt.ylabel(\"$x(t)$\", 
fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
Unfortunately with non-probabilistic approaches, we can't do better, and even Monte Carlo Dropout will fail.\n\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast; depending on the problem, this can be exactly the estimate we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we should definitely avoid these kinds of predictions, as we can easily hit a tree between the two routes. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data has a very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5); the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see how badly this distribution represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope for from a unimodal normal.\n\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test element-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
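#
# Before doing so, it is worth recording a quick numeric baseline (a sketch, not part of the original notebook): the average log-likelihood of the 6th-step test values under the unimodal fit above. A mixture that really captures both peaks should clearly beat this number on the same data.

# average log-likelihood of the 6th-step targets under the fitted unimodal normal
unimodal_avg_loglik = np.mean(nd_test.log_prob(Y_test[:, 5].astype(np.float32)).numpy())
print(unimodal_avg_loglik)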
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n", "project_metadata": {"full_name": "sinusgamma/multimodal_network", "description": "Mixture Density Network with Tensorflow Probability. Demonstrate the usefulness of multi-modal distribution outputs for neural networks.", "topics": [], "git_url": "git://github.com/sinusgamma/multimodal_network.git", "stars": 11, "watchers": 11, "forks": 0, "created": "2020-03-08T10:08:43Z", "size": 3194, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1217660}, "last_updated": "2021-01-04T15:29:04Z"}, "intent": "# sample from the forecasted distribution"}, {"original_comment": "# normalize all numeric attributes to the range [0,1]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. 
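#
# As a rough picture of the idea (a minimal sketch only, not the architecture used later in the lab), an autoencoder pairs an encoder that compresses each input into a few latent dimensions with a decoder that tries to reconstruct the original input from that compressed code:

#%%

import torch.nn as nn


class TinyAutoencoder(nn.Module):

    def __init__(self, input_dim, latent_dim=3):
        super().__init__()
        # encoder: input -> low-dimensional representation
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16), nn.ReLU(), nn.Linear(16, latent_dim))
        # decoder: low-dimensional representation -> reconstruction of the input
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16), nn.ReLU(), nn.Linear(16, input_dim))

    def forward(self, x):
        z = self.encoder(x)      # encode
        return self.decoder(z)   # decode, i.e. the reconstruction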
The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. 
The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n#%%\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n#%%\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n#%%\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n#%%\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n#%%\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 
76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n#%%\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n#%%\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. Let's also save the dataset locally to `GDrive`:\n\n#%%\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n#%%\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n#%%\n\n# number of anomalies vs. 
regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n#%%\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n#%%\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
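#
# To make the Figure 2 example concrete, here is a tiny, hypothetical illustration (the names below are not part of the lab dataset) of how one-hot encoding turns a single categorical column into binary columns:

#%%

import pandas as pd

# hypothetical mini example mirroring Figure 2
toy = pd.DataFrame({'Receiver': ['John', 'Timur', 'Marco', 'John']})

# each distinct name becomes its own binary column
print(pd.get_dummies(toy['Receiver']))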
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n#%%\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n#%%\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n#%%\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n#%%\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)", "target_code": "ori_dataset_num_processed = (\n numeric_attr - numeric_attr.min()) / (numeric_attr.max() - numeric_attr.min())\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. 
Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. 
These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), 
str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. 
Let's also save the dataset locally to `GDrive`:\n\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n\n# number of anomalies vs. regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. 
Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "intent": "# normalize all numeric attributes to the range [0,1]"}, {"original_comment": "# To cast data into float32 type\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n#%%\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n#%%\n\ntrain_df.head()\n\n#%%\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n#%%\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)", "target_code": "train_data = train_data.astype('float32')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n\ntrain_df.head()\n\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = 
to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)\n", "project_metadata": {"full_name": "aditya2000/MNIST-Fashion-", "description": null, "topics": [], "git_url": "git://github.com/aditya2000/MNIST-Fashion-.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2019-07-10T10:06:01Z", "size": 40, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 76938}, "last_updated": "2020-09-28T23:05:02Z"}, "intent": "# To cast data into float32 type"}, {"original_comment": "# plot the distribution of sample means\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. 
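#
# As a small aside (a sketch that assumes SciPy is available; this notebook itself only uses NumPy and Matplotlib), probabilities for a continuous variable come from areas under the density curve, for example:

#%%

from scipy.stats import norm

# probability that a standard normal value falls between -1 and 1,
# i.e. the area under the density curve over that interval (via the CDF)
p_between = norm.cdf(1) - norm.cdf(-1)
print(p_between)  # roughly 0.68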
Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n#%%\n\nfrom numpy.random import randint\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n#%%\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n#%%\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n#%%\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n#%%", "target_code": "from matplotlib import pyplot as plt\n\nplt.hist(means_100)\nplt.show()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). 
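# Hedged aside (not part of the original notebook): a quick sanity check of the
# "my data is roughly Gaussian" assumption mentioned above, using scipy's
# D'Agostino-Pearson normality test on a placeholder sample x.
import numpy as np
from scipy import stats

x = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=500)  # placeholder data
stat, pval = stats.normaltest(x)
print(pval)  # a large p-value gives no evidence against normality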
The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). 
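# Hedged aside (not in the original notebook): a small numeric check of the CLT
# claim with dice. Sample means of size-n draws cluster around the population
# mean (3.5), and their spread shrinks like sigma / sqrt(n), where the variance
# of a fair die is 35/12.
import numpy as np

rng = np.random.default_rng(0)
for n in (5, 50, 500):
    means = rng.integers(1, 7, size=(2000, n)).mean(axis=1)          # 2000 sample means of size n
    print(n, round(means.std(), 3), round(np.sqrt(35 / 12 / n), 3))  # observed vs. predicted spread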
This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n\nfrom numpy.random import randint\nimport numpy as np\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n", "project_metadata": {"full_name": "summerela/python_data_analysis", "description": "Introduction to Data Analysis with Python for UW Foster School of Business", "topics": [], "git_url": "git://github.com/summerela/python_data_analysis.git", "stars": 11, "watchers": 11, "forks": 27, "created": "2019-06-08T02:35:32Z", "size": 7972, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7883836}, "last_updated": "2020-11-09T16:54:13Z"}, "intent": "# plot the distribution of sample means"}, {"original_comment": "# Cluster and plot\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n#%%\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 
'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n#%%\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n#%%\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n#%%\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n#%%\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n#%%\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th 
label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n#%%\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n#%%\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n#%%\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n#%%\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n#%%\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n#%%\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n#%%\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i 
+= 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n#%%\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n#%%\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n#%%\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n#%%\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 
0.89)\nplt.show()\n\n#%%\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n#%%\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. Starting with Gene Set 2:\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n#%%\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 
3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n#%%\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n#%%\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n#%%\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n#%%\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = 
mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). 
We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n#%%\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n#%%\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
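# Hedged sketch of the procedure described above, using placeholder arrays (the
# notebook's real variables such as bigdf, wdf and vex are not reproduced here):
# a weighted mean of expression gives a per-sample "cluster score", and the
# weights are shuffled to build a null distribution of r^2 values against the
# disease-vulnerability vector.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
expr = rng.normal(size=(200, 50))    # placeholder: 200 genes x 50 samples
w = rng.random(200)                  # placeholder cluster-centrality weights
vuln = rng.normal(size=50)           # placeholder per-sample disease vulnerability

cluster_score = (expr * w[:, None]).mean(axis=0)            # weighted mean per sample
r2_true = stats.pearsonr(cluster_score, vuln)[0] ** 2
r2_null = [stats.pearsonr((expr * rng.permutation(w)[:, None]).mean(axis=0), vuln)[0] ** 2
           for _ in range(100)]                             # shuffled-weight null distribution
print(r2_true, np.percentile(r2_null, 95))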
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n#%%\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n#%%\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n#%%\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n#%%\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n#%%\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n#%%\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# 
print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n#%%\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n#%%\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n#%%\n\ncr3.sort_values('index', inplace=True)\n\n#%%\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
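# Hedged aside (not from the original notebook): one simple way to flag whether
# an expression profile looks like a linear gradient or something else is to
# compare linear vs. quadratic fits against axis position. The inputs here are
# placeholders: 'pos' stands in for the sorted axis positions and 'profile' for
# one smoothed gene profile.
import numpy as np

rng = np.random.default_rng(0)
pos = np.linspace(0, 1, 80)                                   # placeholder axis positions
profile = np.sin(np.pi * pos) + 0.1 * rng.normal(size=80)     # a deliberately non-linear profile
lin_sse = np.polyfit(pos, profile, 1, full=True)[1][0]        # residual SSE of the linear fit
quad_sse = np.polyfit(pos, profile, 2, full=True)[1][0]       # residual SSE of the quadratic fit
print('non-linear pattern' if quad_sse < 0.5 * lin_sse else 'roughly linear gradient')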
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n#%%\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)", "target_code": "g = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='Reds')\nplt.show()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# Plot change in accuracy over 
iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = 
xp.loc[mod_genes[mod_genes.step < 3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = 
otpt['final_model'].predict(xpdf.T) # get predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. 
Starting with Gene Set 2:\n\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. 
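# The clustering of genes by shared enriched terms below uses a Jaccard metric
# on binary gene-by-GO-term matrices. A quick toy illustration of that distance
# (the 0/1 membership matrix here is invented):
import numpy as np
from scipy.spatial.distance import pdist, squareform

membership = np.array([[1, 1, 0, 0],   # gene A: hits terms 1 and 2
                       [1, 0, 0, 0],   # gene B: hits term 1 only
                       [0, 0, 1, 1]])  # gene C: hits terms 3 and 4
# Jaccard distance = 1 - (terms shared / terms hit by either gene)
print(squareform(pdist(membership, metric='jaccard')))
# A vs B -> 0.5, A vs C -> 1.0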
These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. 
I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. 
Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
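# A minimal sketch of the "cluster centrality" weighting just described,
# assuming a samples-by-genes expression matrix and a gene-to-weight mapping
# (weight = fraction of cluster terms the gene is associated with). All names
# and numbers here are invented; the notebook's own loop below works directly
# on `bigdf` and the GO `top_hits`.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
expr = pd.DataFrame(rng.normal(size=(30, 4)),
                    columns=['gene_a', 'gene_b', 'gene_c', 'gene_d'])
weights = pd.Series({'gene_a': 0.6, 'gene_b': 0.3,
                     'gene_c': 0.1, 'gene_d': 0.0})
# cluster score = weight-scaled mean expression, one value per sample
cluster_score = (expr * weights).mean(axis=1)
print(cluster_score.head())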
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
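# The gray bars in the plot above mark the upper 95% of the permutation null
# stored in `nulls`. A minimal sketch of deriving that cutoff and an empirical
# p-value from one null distribution; the arrays here are simulated stand-ins.
import numpy as np

rng = np.random.default_rng(2)
null_r2 = rng.uniform(0.0, 0.05, size=100)   # stand-in for one row of `nulls`
observed_r2 = 0.12                           # stand-in for a real cluster r2

cutoff_95 = np.percentile(null_r2, 95)
p_emp = (np.sum(null_r2 >= observed_r2) + 1) / (len(null_r2) + 1)
print('95%% null cutoff = %.3f, empirical p = %.3f' % (cutoff_95, p_emp))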
Repeat with all of that with Set 3!\n\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, 
posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n\ncr3.sort_values('index', inplace=True)\n\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
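# One generic way to look for expression pattern "types" beyond linear
# gradients is to z-score each gene's smoothed axis profile and cluster the
# profiles. This is only a sketch on synthetic profiles; the notebook's own
# clustering choices for this revision may differ.
import numpy as np
from scipy import ndimage
from sklearn.cluster import KMeans

rng = np.random.default_rng(3)
axis = np.linspace(0, 1, 80)
profiles = np.vstack([
    np.outer(rng.uniform(0.5, 1.5, 20), axis),                  # linear gradient
    np.outer(rng.uniform(0.5, 1.5, 20), np.sin(np.pi * axis)),  # mid-axis peak
]) + rng.normal(scale=0.1, size=(40, 80))

smoothed = ndimage.gaussian_filter1d(profiles, 3, axis=1)
z = (smoothed - smoothed.mean(1, keepdims=True)) / smoothed.std(1, keepdims=True)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(z)
print(np.bincount(labels))  # roughly 20 genes of each pattern type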
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\n", "project_metadata": {"full_name": "illdopejake/Hippocampus_AP_Axis", "description": "Code used for Hippocampus Anterior/Posterior gene expression and neuroimaging analyses ", "topics": [], "git_url": "git://github.com/illdopejake/Hippocampus_AP_Axis.git", "stars": 7, "watchers": 7, "forks": 1, "created": "2018-05-20T18:18:47Z", "size": 149297, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 20340748, "Python": 58444, "Shell": 2454}, "last_updated": "2020-12-20T09:17:56Z"}, "intent": "# Cluster and plot"}, {"original_comment": "# This makes a white background with grid lines\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n#%%\n\nimport seaborn as sns", "target_code": "import seaborn as sns\n\nsns.set_style(\"whitegrid\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. 
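# This record's comment ("makes a white background with grid lines") points at
# seaborn's whitegrid style, and its target snippet is sns.set_style("whitegrid").
# A self-contained example with invented toy data:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("whitegrid")

tips = pd.DataFrame({'total_bill': [10.3, 21.0, 23.7, 15.5, 31.2],
                     'tip': [1.7, 3.3, 3.5, 3.0, 5.0]})
sns.scatterplot(data=tips, x='total_bill', y='tip')
plt.show()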
These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n\n", "project_metadata": {"full_name": "emonson/pandas-datamatters", "description": "Python for Tabular Data and Visualization \u2013 Data Matters 2020", "topics": [], "git_url": "git://github.com/emonson/pandas-datamatters.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-12-02T18:35:22Z", "size": 5862, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1794056}, "last_updated": "2021-01-05T16:21:04Z"}, "intent": "# Make a white background with grid lines"}, {"original_comment": "# print git status for the local repository\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n#%%\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n#%%\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n#%%\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n#%%\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n#%%\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)", "target_code": "status_command = \"git status\"\noutput = subprocess.check_output(status_command, shell=True)\nprint(output)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. 
Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)\n", "project_metadata": {"full_name": "uzh/helmchen-spark", "description": "Playbooks and other files to build a (virtual) Spark cluster for Prof. Helmchen's research group", "topics": [], "git_url": "git://github.com/uzh/helmchen-spark.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2016-03-23T21:54:52Z", "size": 6519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2819538, "Python": 37375, "Shell": 3482}, "last_updated": "2019-12-15T16:09:17Z"}, "intent": "# print git status for the local repository"}, {"original_comment": "# Let's import the set\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n#%%", "target_code": "import pandas as pd\n\ndf = pd.read_csv('../data/train.csv', index_col=0)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. 
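# Back in the git notebook above: the "Commit changes and push to Github"
# section stops right after os.chdir(repo_path). A sketch of how it could
# continue with the same subprocess pattern; the commit message, branch name
# and staging choices here are assumptions, not the notebook's actual code.
import subprocess

for cmd in ["git add --all",
            'git commit -m "Update notebooks"',
            "git push origin master"]:
    return_code = subprocess.call(cmd, shell=True)
    if return_code:
        print("Command failed: %s" % cmd)
        break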
So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n", "project_metadata": {"full_name": "commit-live-students/GLabs_DSMX", "description": null, "topics": [], "git_url": "git://github.com/commit-live-students/GLabs_DSMX.git", "stars": 6, "watchers": 6, "forks": 23, "created": "2020-03-27T12:43:39Z", "size": 19480, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12966885}, "last_updated": "2020-12-24T07:12:28Z"}, "intent": "# Let's import the set"}, {"original_comment": "# creating white noise\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n#%%\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n#%%\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n#%%\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n#%%\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nplt.imshow(img1)\n\n#%%\n\nplt.imshow(img2)\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\n# blending images of same size\n\n#%%\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n#%%\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nlarge_img = img1\nsmall_img = img2\n\n#%%\n\nx_offset = 0\ny_offset = 0\n\n#%%\n\n# in numpy x axis is vertical and y axis is horizontal\n\n#%%\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n#%%\n\nlarge_img[y_offset:y_end, x_offset:x_end] = 
small_img\nplt.imshow(large_img)\n\n#%%\n\n# Blend images of different sizes\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nimg1.shape\n\n#%%\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n#%%\n\nimg2.shape\n\n#%%\n\nrows, cols, channels = img2.shape\n\n#%%\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n#%%\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n#%%\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n#%%\n\nmask_inv.shape\n\n#%%\n\n# you can see the image is 2D now\n\n#%%\n\n\n\n#%%\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n#%%\n\nplt.imshow(white_bgd)\n\n#%%\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n#%%\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n#%%\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n#%%\n\nlarge_img = img1\nsmall_img = final_roi\n\n#%%\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n#%%\n\n\n\n#%%\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n#%%\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n#%%\n\nret\n\n#%%\n\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\nshow_img(img)\n\n#%%\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n#%%\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n#%%\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n#%%\n\n\n\n#%%\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\ni = load_img()\nshow_img(i)\n\n#%%\n\ngamma = 
1/4\n\n#%%\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n#%%\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n#%%\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n#%%\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n#%%\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n#%%\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n#%%\n\nimg = load_img()\nshow_img(img)\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n#%%\n\nimg = load_img()", "target_code": "white_noise = np.random.randint(0, 2, size=(600, 600))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR 
cahannel\n\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nplt.imshow(img1)\n\n\nplt.imshow(img2)\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\n# blending images of same size\n\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nlarge_img = img1\nsmall_img = img2\n\n\nx_offset = 0\ny_offset = 0\n\n\n# in numpy x axis is vertical and y axis is horizontal\n\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n\n# Blend images of different sizes\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nimg1.shape\n\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n\nimg2.shape\n\n\nrows, cols, channels = img2.shape\n\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n\nmask_inv.shape\n\n\n# you can see the image is 2D now\n\n\n\n\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n\nplt.imshow(white_bgd)\n\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n\nlarge_img = img1\nsmall_img = final_roi\n\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n\n\n\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n\nret\n\n\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, 
cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\nshow_img(img)\n\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n\n\n\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\ni = load_img()\nshow_img(i)\n\n\ngamma = 1/4\n\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n\ndef load_img():\n blank_img = 
np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n\nimg = load_img()\nshow_img(img)\n\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n\nimg = load_img()\n", "project_metadata": {"full_name": "RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning", "description": null, "topics": [], "git_url": "git://github.com/RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning.git", "stars": 3, "watchers": 3, "forks": 5, "created": "2019-05-28T02:31:41Z", "size": 48363, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 98148466, "Python": 466}, "last_updated": "2020-12-21T09:24:02Z"}, "intent": "# creating white noise"}, {"original_comment": "# ## 5. Set up OneHot encoding parameters\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#
\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#
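# A minimal sketch of the one-hot encoding setup the steps above refer to,
# using the category_encoders package this notebook already imports as `ce`.
# The column list is an assumption; 'Neighborhood' and 'Alley' are simply
# columns known to exist in this Iowa dataset.
import category_encoders as ce

onehot_cols = ['Neighborhood', 'Alley']
onehot_encoder = ce.OneHotEncoder(cols=onehot_cols, use_cat_names=True)

# Typical usage: fit on the training split only, then transform every split,
# e.g. train_oh = onehot_encoder.fit_transform(train); test_oh = onehot_encoder.transform(test)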
\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n#%%\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n#%%\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n#%%\n\ndf = df_orig.copy()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(5)\n\n#%%\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n#%%\n\n# Use function to add in indicators for presence of null values\n\n#%%\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n#%%\n\ndf = denote_null_values(df)\n\n#%%\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n#%%\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n#%%\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n#%%\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n#%%\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n#%%\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', 
axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n#%%\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n#%%\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n#%%\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n#%%\n\nfloat(9.00000).is_integer()\n\n#%%\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n#%%\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n\n\n#%%\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n#%%\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n#%%\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n#%%\n\n# ?np.where\n\n#%%\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n#%%\n\ndf.head(5)\n\n#%%\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n#%%\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n#%%\n\ndf['MasVnrType'].value_counts()\n\n#%%\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n#%%\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n#%%\n\ndf[(df.BsmtQual_missing == True)]\n\n#%%\n\ndf.BsmtCond.value_counts()\n\n#%%\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from 
dataset\n\n#%%\n\ndf.Electrical.value_counts()\n\n#%%\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n#%%\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces in places with fireplaces missing\n\n#%%\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n#%%\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n#%%\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n#%%\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n#%%\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n#%%\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n 
validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n#%%\n\n# ## 2. Create additional bespoke data features\n\n#%%\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n#%%\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n#%%\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n#%%\n\ndf['Functional'].value_counts()\n\n#%%\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n#%%\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n#%%\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n#%%\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n#%%\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n#%%\n\ndf['PoolQC'].value_counts()\n\n#%%\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n#%%\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n#%%\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n#%%\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n#%%\n\ndf['Condition1'].value_counts()\n\n#%%\n\ndf['Condition2'].value_counts()\n\n#%%\n\ndf['Condition1']\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n#%%\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n#%%\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n#%%\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n#%%\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n#%%\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, 
inplace=True)\ndf.drop('Condition2', axis=1, inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n#%%\n\ndf.head(10)\n\n#%%\n\n# ***\n# ## 4. Set up target encoding parameters\n\n#%%\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n#%%\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n#%%\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n#%%\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_oe.head(5)\n\n\n# ***", "target_code": "onehot_enc = ce.OneHotEncoder(verbose=1, cols=[\n 'Street', 'Alley', 'CentralAir', 'MiscFeature'], use_cat_names=True)\nonehot_enc.get_params()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. 
This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n\ndf = df_orig.copy()\n\n\ndf.info()\n\n\ndf.head(5)\n\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n\n# Use function to add in indicators for presence of null values\n\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood 
is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n\nfloat(9.00000).is_integer()\n\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n\n\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n\n# ?np.where\n\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n\ndf.head(5)\n\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n\ndf['MasVnrType'].value_counts()\n\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n\ndf[(df.BsmtQual_missing == True)]\n\n\ndf.BsmtCond.value_counts()\n\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from dataset\n\n\ndf.Electrical.value_counts()\n\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces 
in places with fireplaces missing\n\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n 
validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n\n# ## 2. Create additional bespoke data features\n\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n\ndf['Functional'].value_counts()\n\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n\ndf['PoolQC'].value_counts()\n\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n\ndf['Condition1'].value_counts()\n\n\ndf['Condition2'].value_counts()\n\n\ndf['Condition1']\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, inplace=True)\ndf.drop('Condition2', axis=1, 
inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n\ndf.head(10)\n\n\n# ***\n# ## 4. Set up target encoding parameters\n\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_oe.head(5)\n\n\n# ***\n\n\n\n", "project_metadata": {"full_name": "JonathanBechtel/DAT-10-19", "description": "GitHub Repo For DAT 10-19", "topics": [], "git_url": "git://github.com/JonathanBechtel/DAT-10-19.git", "stars": 2, "watchers": 2, "forks": 11, "created": "2020-10-19T14:53:15Z", "size": 108252, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 72671490, "HTML": 915086, "Python": 92446, "Shell": 222}, "last_updated": "2021-01-06T23:37:08Z"}, "intent": "# 5. 
Set up OneHot encoding parameters"}, {"original_comment": "# Summarize the new class distribution\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n#%%\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n#%%\n\ndf_1 = pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n#%%\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n#%%\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n#%%\n\n# Checking for data types\n# fraud_df.dtypes\n\n#%%\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n#%%\n\n#fraud_df = fraud_df.head(10000)\n\n#%%\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. 
For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n#%%\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n#%%\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n#%%\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### Encoding\n\n#%%\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n#%%\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n#%%\n\nfraud_df[cat_col_onehotencode].head()\n\n#%%\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n#%%\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the 
log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# fraud_df.columns\n\n#%%\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n#%%\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n#%%\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n#%%\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n#%%\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n#%%\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n#%%\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n#%%\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n#%%\n\n# change data 
sets into maxtrix objects for the models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n#%%\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n#%%\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)", "target_code": "counter = Counter(y_train)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import 
RandomUnderSampler\n\n\n# ### Read data\n\n\ndf_1 = pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n\n# Checking for data types\n# fraud_df.dtypes\n\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n\n#fraud_df = fraud_df.head(10000)\n\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n\nfraud_df.head()\n\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### Encoding\n\n\n# 
create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n\nfraud_df[cat_col_onehotencode].head()\n\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n\nfraud_df.head()\n\n\n# fraud_df.columns\n\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n\n# change data sets into maxtrix objects for the 
models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "intent": "# Summarize the new class distribution"}, {"original_comment": "# remove reviews without audio features from Spotify\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport 
torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n#%%\n\n# ## Data\n\n#%%\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n#%%\n\n# DATA_DF.content[0]\n\n#%%\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n#%%", "target_code": "DATA_DF = DATA_DF.loc[~DATA_DF.audio_features.isna()]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n\n# ## Data\n\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n\n# DATA_DF.content[0]\n\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n", "project_metadata": {"full_name": "iconix/openai", "description": "OpenAI Scholar, general materials", "topics": [], "git_url": "git://github.com/iconix/openai.git", "stars": 16, "watchers": 16, "forks": 3, "created": "2018-11-02T19:26:13Z", "size": 69033, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 22113154, "Python": 46353, "JavaScript": 8783, "Shell": 2297, "HTML": 970}, "last_updated": "2020-06-01T14:04:53Z"}, "intent": "# remove reviews without audio features from Spotify"}, {"original_comment": "# Loading in the data.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n#%%\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n#%%\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n#%%\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n#%%\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n#%%\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n#%%\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n#%%\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n#%%\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n#%%\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224", "target_code": "data = ImageClassifierData.from_paths(\n PATH, tfms=tfms_from_model(arch, sz), bs=64)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224\n", "project_metadata": {"full_name": "zachmonge/fish_computer_vision_example", "description": "This is the repository corresponding to my Medium blog post titled \"Does Deep Learning Really Require 'Big Data'? --No!\"", "topics": [], "git_url": "git://github.com/zachmonge/fish_computer_vision_example.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-08-20T03:51:12Z", "size": 8148, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1391423}, "last_updated": "2020-02-13T19:27:08Z"}, "intent": "# Loading in the data."}, {"original_comment": "# init the Support Vector Machine classifier\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. 
Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. 
This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. 
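# As a quick worked example (the numbers here are invented purely for intuition and are not part of the original tutorial): for a hypothetical weight vector $w = (3, 4)$,
#
# $$ ||w|| = \sqrt{3^{2} + 4^{2}} = 5, \qquad \text{margin} = \frac{2}{||w||} = \frac{2}{5} = 0.4 $$
#
# so, among hyperplanes satisfying the constraints, a smaller $||w||$ (equivalently, a smaller $||w||^{2}$) gives a wider margin, which is exactly why the optimization minimizes $||w||^{2}$.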
Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. 
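# For orientation, the "off-the-shelf" workflow is simply: instantiate the classifier, `fit` it on the training split, and evaluate on the held-out split. The short sketch below is only a preview, assuming the `x_train`, `y_train`, `x_eval`, `y_eval` arrays and `random_seed` defined earlier are available; the remainder of the lab walks through the same steps in detail.

#%%

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# linear-kernel SVC, seeded for reproducibility
svm_sketch = SVC(kernel='linear', random_state=random_seed)
svm_sketch.fit(x_train, y_train)

# score on the 30% evaluation split that was held back from training
y_pred = svm_sketch.predict(x_eval)
print('evaluation accuracy: {:.3f}'.format(accuracy_score(y_eval, y_pred)))
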
Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%", "target_code": "from sklearn.svm import SVC\n\nsvm = SVC(kernel='linear', random_state=random_seed)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "intent": "# init the Support Vector Machine classifier"}, {"original_comment": "# Look globally at correlation of features.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n#%%\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n#%%\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n#%%\n\n# Check shape.\ncaravan_df_raw.shape\n\n#%%\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n#%%\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n#%%\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n#%%\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n#%%\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n#%%\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n#%%\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n#%%", "target_code": "corr = caravan_df_raw.corr()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n\n# Check shape.\ncaravan_df_raw.shape\n\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n", "project_metadata": {"full_name": "jonrossi/caravan-insurance", "description": "Exploration and analysis of the Caravan Insurance dataset", "topics": [], "git_url": "git://github.com/jonrossi/caravan-insurance.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-09-23T17:40:57Z", "size": 951, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1260942}, "last_updated": "2020-10-31T21:58:03Z"}, "intent": "# Look globally at correlation of features."}, {"original_comment": "# making predictions on the testing set\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook 
is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. 
In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n#%%\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n#%%\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n#%%\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
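# A quick numeric illustration (probability values chosen arbitrarily): as $p(y=1|\mathbf{x})$ moves from 0 toward 1, the odds stay non-negative and grow without bound, so an unconstrained dot product is still a poor model for them.

#%%

for p in [0.1, 0.5, 0.9, 0.99]:
    odds = p / (1 - p)
    print('p = {:.2f}  ->  odds = {:.2f}'.format(p, odds))
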
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n#%%\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n#%%\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n#%%\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n#%%\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n#%%\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n#%%\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n#%%\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n#%%\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with 
scikit-learn:\n\n#%%\n\nX, y = load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')", "target_code": "y_pred = lr.predict(X_test)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. 
What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with scikit-learn:\n\n\nX, y = 
load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')\n", "project_metadata": {"full_name": "asabenhur/CS345", "description": "Jupyter", "topics": [], "git_url": "git://github.com/asabenhur/CS345.git", "stars": 4, "watchers": 4, "forks": 11, "created": "2020-08-11T19:32:02Z", "size": 6413, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4808835}, "last_updated": "2020-12-30T20:50:00Z"}, "intent": "# making predictions on the testing set"}, {"original_comment": "# changing the datatype\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n#%%\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n#%%\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n#%%\n\ndata.dtypes\n\n#%%\n\ndata.size\n\n#%%\n\ndata.info()\n\n#%%\n\ndata['RAM'].describe()\n\n#%%\n\ndata.describe()\n\n#%%\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n#%%\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n#%%\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n#%%\n\nlen(df)\n\n#%%\n\ndf.fillna(np.nan)\ndf\n\n#%%\n\ndf_dropped = df.dropna()\ndf_dropped\n\n#%%\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n#%%\n\ndf.head(2)\n\n#%%\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n#%%\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking unique values\ndfm['GPRS'].unique()\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, 
'4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n#%%\n\n# count of every column\ndfm.count()\n\n#%%\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n#%%\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n#%%\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n#%%\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n#%%\n\ndf1 = df_dropped\nlen(df1)\n\n#%%\n\n# checking the datatypes\ndf1.dtypes\n\n#%%\n\n# displaying info\ndf1.info()\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf1.tail()\n\n#%%\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n#%%\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n#%%", "target_code": "df1[['display_size']] = df1[['display_size']].apply(pd.to_numeric)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n\ndata.dtypes\n\n\ndata.size\n\n\ndata.info()\n\n\ndata['RAM'].describe()\n\n\ndata.describe()\n\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n\ndf.info()\n\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n\nlen(df)\n\n\ndf.fillna(np.nan)\ndf\n\n\ndf_dropped = df.dropna()\ndf_dropped\n\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n\ndf.head(2)\n\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n\n# checking unique values\ndfm['GPRS'].unique()\n\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = 
dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n\n# checking data types\ndfm.dtypes\n\n\n# count of every column\ndfm.count()\n\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n\ndf1 = df_dropped\nlen(df1)\n\n\n# checking the datatypes\ndf1.dtypes\n\n\n# displaying info\ndf1.info()\n\n\ndf1.head()\n\n\ndf1.tail()\n\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n", "project_metadata": {"full_name": "yatinagg/Mobile_Price_Classification", "description": "Dataritz Phone Price Classification", "topics": [], "git_url": "git://github.com/yatinagg/Mobile_Price_Classification.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-05-18T15:04:18Z", "size": 6525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4727409, "Python": 568}, "last_updated": "2020-08-30T08:37:02Z"}, "intent": "# changing the datatype"}, {"original_comment": "# train RMSE, MSE\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. 
The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n#%%\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n#%%\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n#%%\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n#%%\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n#%%\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n#%%\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n#%%\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n#%%\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n#%%\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n#%%\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n#%%\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n#%%\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n#%%\n\ntrainingSummary = model.summary", "target_code": "print(\"RMSE: {}\".format(trainingSummary.rootMeanSquaredError))\nprint(\"MSE: {}\".format(trainingSummary.meanSquaredError))\nprint(\"R2: {}\".format(trainingSummary.r2))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. 
[Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n\ntrainingSummary = model.summary\n", "project_metadata": {"full_name": 
"dangkhoadl/my-BigData", "description": "A cache to store my Distributed System and Big Data resources", "topics": ["big-data", "coursera", "operating-systems", "distributed-systems", "cloud-computing"], "git_url": "git://github.com/dangkhoadl/my-BigData.git", "stars": 7, "watchers": 7, "forks": 8, "created": "2017-12-23T05:56:43Z", "size": 49086, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 888066, "C++": 48288, "Shell": 6317, "Python": 3334, "Makefile": 990}, "last_updated": "2020-01-21T03:30:03Z"}, "intent": "# train RMSE, MSE"}, {"original_comment": "# Second Convolutional layer.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n#%%\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n#%%\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n#%%\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. 
In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n#%%\n\ninput_shape\n\n#%%\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n#%%\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n#%%\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. Use 50 epochs, 25 training steps and 15 validation steps\n\n#%%\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n#%%\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n#%%\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n#%%\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))", "target_code": "new_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. 
We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n\ninput_shape\n\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. 
Use 50 epochs, 25 training steps and 15 validation steps\n\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. 
Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# Second Convolutional layer."}, {"original_comment": "# ## Prepare a train/test set for Validating Stacking\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n#%%\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n#%%\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = 
test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n#%%\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n#%%\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n#%%\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n#%%\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = 
ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n#%%\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n#%%\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)", "target_code": "from sklearn.model_selection import train_test_split\n\ntrain_ = train.fillna(-999)\ntest_ = test.fillna(-999)\ntraining, testing = train_test_split(train_, test_size=0.2, random_state=42)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n 
(train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n\nReducedVar = 
[]\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)\n\n\n\n", "project_metadata": {"full_name": "liujiashen9307/KaggleCompetition", "description": "Code hub for the kaggle competitions I have participated in.", "topics": [], "git_url": "git://github.com/liujiashen9307/KaggleCompetition.git", "stars": 6, "watchers": 6, "forks": 10, "created": "2016-10-12T21:10:54Z", "size": 15258, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16811198, "HTML": 14162298, "Python": 1658600, "R": 8306}, "last_updated": "2020-02-01T03:33:11Z"}, "intent": "# Prepare a train/test set for Validating Stacking"}, {"original_comment": "# ### Compute the explained variance for new data set.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata[data.columns[data.dtypes == int]]\n\n#%%\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n#%%\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)", "target_code": "pca.get_covariance()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata[data.columns[data.dtypes == int]]\n\n\ndata.pitchType.value_counts()\nlb_make = 
LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)\n\n\n\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# Compute the explained variance for new data set."}, {"original_comment": "# Test that the path can be parsed successfully\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n#%%\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n#%%\n\ncelldom.get_repo_dir()\n\n#%%\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n#%%\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n#%%\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n#%%\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n#%%\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n#%%", "target_code": "exp_config.parse_path(test_path)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n\ncelldom.get_repo_dir()\n\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n", "project_metadata": {"full_name": "hammerlab/SmartCount", 
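# A short check that usually follows the dimensionality reduction above: the fitted
# 2-component `pca` object from the PitchFX cells earlier exposes how much of the scaled
# data's variance the projection keeps. This sketch assumes the `pca` object fitted in
# those cells.

#%%

print(pca.explained_variance_)              # absolute variance captured by each component
print(pca.explained_variance_ratio_)        # fraction of total variance per component
print(pca.explained_variance_ratio_.sum())  # total variance retained in the 2-D projection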
"description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "intent": "# Test that the path can be parsed successfully"}, {"original_comment": "# compling and show model\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n#%%\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import optimizers\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n#%%\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n#%%\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n#%%\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n#%%\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n#%%\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n#%%\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), 
activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))", "target_code": "from keras import optimizers\n\nmodel.compile(loss=\"categorical_crossentropy\", optimizer=optimizers.SGD(\n lr=0.0001, momentum=0.9), metrics=[\"accuracy\"])\nmodel.summary()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 
2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))\n", "project_metadata": {"full_name": "WuZhuoran/Plant_Seedlings_Classification", "description": "Kaggle Competition Project as well as ANLY 590 Final Project. Task: Determine the species of a seedling from an image", "topics": [], "git_url": "git://github.com/WuZhuoran/Plant_Seedlings_Classification.git", "stars": 10, "watchers": 10, "forks": 7, "created": "2018-10-31T01:19:27Z", "size": 10167, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2140227, "Python": 31477}, "last_updated": "2020-12-18T16:42:52Z"}, "intent": "# compling and show model"}, {"original_comment": "# Area under the ROC\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n#%%\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n#%%\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n#%%\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n#%%\n\n# View one of the records\ndata.take(1)\n\n#%%\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n#%%\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n#%%\n\nlr_model = lr.fit(train)\n\n#%%\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n#%%\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n#%%\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. 
Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n#%%\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n#%%", "target_code": "print(\"Area Under ROC = %.2f\" % metrics.areaUnderROC)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n\n# View one of the records\ndata.take(1)\n\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n\nlr_model = lr.fit(train)\n\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. 
Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "intent": "# Area under the ROC"}, {"original_comment": "# ## Extract Feature\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\nfrom extract_feature import FeatureExtractor\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n#%%\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()", "target_code": "from extract_feature import FeatureExtractor\n\nFE = FeatureExtractor(model_name='xlm-r')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()\n\n\n\n", "project_metadata": {"full_name": "ilhamfp/indonesian-text-classification-multilingual", "description": "Improving Indonesian text classification using multilingual language model", "topics": ["multilingual-language-model", "text-classification", "indonesian-language", "indonesian-text-classification", "sentiment-analysis", "hate-speech-detection", "language-model", "multilingual", "zero-shot", "monolingual", "cross-lingual-transfer", "multilingual-language-models", "indonesian-data", "english-language"], "git_url": 
"git://github.com/ilhamfp/indonesian-text-classification-multilingual.git", "stars": 7, "watchers": 7, "forks": 0, "created": "2020-04-26T07:27:39Z", "size": 15604, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3476215, "Python": 28982}, "last_updated": "2020-12-20T17:12:07Z"}, "intent": "# Extract Feature"}, {"original_comment": "# Feature selection: remove variables no longer containing relevant information\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. 
They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n#%%\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n#%%\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n 
dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\n#%%", "target_code": "train = train.drop(drop_elements, axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. 
A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n dataset.loc[dataset['Fare'] 
<= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\ndrop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# Feature selection: remove variables no longer containing relevant information"}, {"original_comment": "# load and view a single file\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. 
This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n#%%\n\nfrom sklearn.metrics import make_scorer\nfrom sklearn import tree\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n#%%\n\nget_ipython().system(' pip install --upgrade pandas')\n\n#%%\n\n# if working on google colab\n#! 
pip install -U -q PyDrive\n#from google.colab import drive\n# drive.mount('/content/gdrive')\n\n#%%\n\nget_ipython().system('pip install --user mlmicrophysics==0.1.1')\nget_ipython().system('pip install --upgrade pypi')\n\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. 
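(Illustrative aside: the two KK2000 fits quoted above translate almost directly into array code. The function below is only a rough sketch of those published formulas for orientation, not the CAM6/MG2 source, and it keeps the original prefactors and exponents rather than the CAM6-adjusted values; all names are assumptions. It works elementwise on floats or numpy arrays.)

def kk2000_rates(qc, qr, nc):
    """Sketch of the KK2000 autoconversion and accretion fits quoted above (kg/kg/s)."""
    # qc, qr: cloud and rain water mass mixing ratios (kg/kg); nc: cloud droplet number concentration (kg-1)
    autoconversion = 13.5 * qc ** 2.47 * nc ** -1.1   # cloud water converted to rain
    accretion = 67.0 * (qc * qr) ** 1.15              # rain collecting cloud water
    return autoconversion, accretion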
We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n#%%\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n#%%\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n#%%\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n#%%\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\nprint(out_scales_df)\n\n#%%\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < 
len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n#%%\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n#%%", "target_code": "single_file = pd.read_parquet(fobj).set_index('Index')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain 
processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. 
Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n\nfrom sklearn.metrics import make_scorer\nfrom sklearn import tree\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# if working on google colab\n#! pip install -U -q PyDrive\n#from google.colab import drive\n# drive.mount('/content/gdrive')\n\n\nget_ipython().system('pip install --user mlmicrophysics==0.1.1')\nget_ipython().system('pip install --upgrade pypi')\n\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. 
KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
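(Illustrative aside: the tendency bookkeeping just described amounts to a finite difference with a floor. The sketch below is only a hypothetical rendering of that idea, not the model's actual limiter logic; every name is an assumption.)

def limited_tendency(initial, final, dt):
    """Tendency = (final state - initial state) / timestep, with a simple limiter."""
    tend = (final - initial) / dt
    # crude limiter: never remove more than was initially available within one timestep
    return max(tend, -initial / dt)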
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\nprint(out_scales_df)\n\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n 
ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, 
"license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "intent": "# load and view a single file"}, {"original_comment": "# Convert to boolean and print count\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport torch.nn as nn\nimport torch\nfrom torch.autograd import Variable\nfrom matplotlib import pyplot as plt\nimport torch.nn.functional as F\nimport time\nfrom statistics import mean\nfrom sklearn import preprocessing\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.preprocessing import scale\nfrom sklearn import metrics\n\n#%%\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n#%%\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_medium.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))", "target_code": "df1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport torch.nn as nn\nimport torch\nfrom torch.autograd import Variable\nfrom matplotlib import pyplot as plt\nimport torch.nn.functional as F\nimport time\nfrom statistics import mean\nfrom sklearn import preprocessing\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.preprocessing import scale\nfrom sklearn import metrics\n\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_medium.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] 
= df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n", "project_metadata": {"full_name": "abhishek3aj/ML1819--task-101--team-06", "description": "ML framework comparison", "topics": [], "git_url": "git://github.com/abhishek3aj/ML1819--task-101--team-06.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-10-09T09:48:20Z", "size": 21107, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4638466, "Python": 84406}, "last_updated": "2018-12-17T19:27:23Z"}, "intent": "# Convert to boolean"}, {"original_comment": " # save all train test results\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom Resnet import resnet10\nimport os\nimport pickle\nimport torch\nimport torchvision\nimport numpy as np\nimport pandas as pd\nimport torch.nn as nn\nfrom PIL import Image\nfrom tqdm import tqdm\nimport torch.nn as nn\nimport torch.utils.data as data1\nfrom torch.utils import data\nimport torch.nn.functional as F\nimport torchvision.models as models\nimport matplotlib.pyplot as plt\nfrom torch.autograd import Variable\nfrom sklearn.metrics import accuracy_score\nimport torchvision.transforms as transforms\nfrom sklearn.preprocessing import OneHotEncoder, LabelEncoder\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\n\n#%%\n\nfrom torch.utils import data\n\n\nclass Dataload_3D_CNN(data.Dataset):\n \"Characterizes a dataset for PyTorch\"\n\n def __init__(self, data_path, transform=None):\n \"Initialization\"\n self.transform = transform\n #self.frames = frames\n self.folders = data_path\n\n def __len__(self):\n \"Denotes the total number of samples\"\n return len(os.listdir(self.folders))\n\n def read_images(self, data_path, use_transform):\n X = []\n for i in os.listdir(data_path):\n #print(\"file name is \",i)\n image = Image.open(os.path.join(data_path, i))\n\n # print(image.shape)\n if use_transform is not None:\n image = use_transform(image)\n # print(image.size)\n image = torch.from_numpy(np.asarray(image))\n X.append(image)\n # print(X)\n #X = np.array(X)\n X = torch.stack(X, dim=0)\n\n return X\n\n def __getitem__(self, index):\n \"Generates one sample of data\"\n # Select sample\n #print(\"index passed is \",index)\n # print(self.folders)\n data_path = os.path.join(self.folders, os.listdir(self.folders)[index])\n #data_path = self.folders+ str(index)\n #print(\"Data path is \",data_path)\n\n # Load data\n # (input) spatial images\n X = self.read_images(data_path, self.transform)\n\n y = 1\n if 'orig' in data_path:\n y = 0\n # print(X.shape)\n return X, torch.from_numpy(np.array(y)).type(torch.LongTensor)\n\n#%%\n\nTRANSFORM_IMG = transforms.Compose([\n transforms.Resize(128),\n # transforms.CenterCrop(256),\n # transforms.ToTensor()\n # transforms.Normalize(mean=[0.485, 0.456, 0.406],\n # std=[0.229, 0.224, 0.225] )\n])\n\n#%%\n\ntrain_path = '/home/chinmay/datatset/train/'\ntrain_data = Dataload_3D_CNN(train_path, transform=TRANSFORM_IMG)\n# for step, (x, y) in enumerate(data):\n# print(x.shape)\nval_path = '/home/chinmay/datatset/val/'\nval_data = Dataload_3D_CNN(val_path, transform=TRANSFORM_IMG)\n\n#%%\n\nepochs = 20\nbatch_size = 64\nlearning_rate = 6 * 1e-5\nlog_interval = 10\nimg_x, img_y = 96, 96 # 128,128#256, 256 # 
resize video 2d frame size\n\n#%%\n\n# Detect devices\nuse_cuda = torch.cuda.is_available() # check if GPU exists\ndevice = torch.device(\"cuda\" if use_cuda else \"cpu\") # use CPU or GPU\nprint(\"Is use_cuda\", use_cuda)\n# Now load the dataset\nparams = {'batch_size': batch_size, 'shuffle': True,\n 'num_workers': 4, 'pin_memory': True} if use_cuda else {}\n# Load the dataset\n\ntrain_loader = data1.DataLoader(train_data, **params)\nvalid_loader = data1.DataLoader(val_data, **params)\n\n#%%\n\n## ------------------------ 3D CNN module ---------------------- ##\ndef conv3D_output_size(img_size, padding, kernel_size, stride):\n # print(\"Image size is \",img_size)\n # compute output shape of conv3D\n outshape = (np.floor((img_size[0] + 2 * padding[0] - (kernel_size[0] - 1) - 1) / stride[0] + 1).astype(int),\n np.floor((img_size[1] + 2 * padding[1] -\n (kernel_size[1] - 1) - 1) / stride[1] + 1).astype(int),\n np.floor((img_size[2] + 2 * padding[2] - (kernel_size[2] - 1) - 1) / stride[1] + 1).astype(int))\n# print(\"The output shape is \",outshape)\n return outshape\n\n\nclass CNN3D(nn.Module):\n def __init__(self, t_dim=120, img_x=90, img_y=120, drop_p=0.2, fc_hidden1=256, fc_hidden2=128, fc_hidden3=64, num_classes=2):\n super(CNN3D, self).__init__()\n\n # set video dimension\n self.t_dim = t_dim\n self.img_x = img_x\n self.img_y = img_y\n # fully connected layer hidden nodes\n self.fc_hidden1, self.fc_hidden2, self.fc_hidden3 = fc_hidden1, fc_hidden2, fc_hidden3\n self.drop_p = drop_p\n self.num_classes = num_classes\n self.ch1, self.ch2, self.ch3 = 32, 48, 64\n self.k1, self.k2, self.k3 = (\n 1, 3, 3), (1, 3, 3), (1, 3, 3) # 3d kernel size\n self.s1, self.s2, self.s3 = (\n 1, 1, 1), (1, 1, 1), (1, 1, 1) # 3d strides\n self.pd1, self.pd2, self.pd3 = (\n 0, 0, 0), (0, 0, 0), (0, 0, 0) # 3d padding\n\n # compute conv1 & conv2 output shape\n self.conv1_outshape = conv3D_output_size(\n (3, self.img_x, self.img_y), self.pd1, self.k1, self.s1)\n self.conv2_outshape = conv3D_output_size(\n self.conv1_outshape, self.pd2, self.k2, self.s2)\n self.conv3_outshape = conv3D_output_size(\n self.conv2_outshape, self.pd3, self.k3, self.s3)\n\n self.conv1 = nn.Conv3d(in_channels=10, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1,\n padding=self.pd1)\n self.bn1 = nn.BatchNorm3d(self.ch1)\n self.conv2 = nn.Conv3d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2,\n padding=self.pd2)\n self.bn2 = nn.BatchNorm3d(self.ch2)\n self.conv3 = nn.Conv3d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3,\n padding=self.pd3)\n self.bn3 = nn.BatchNorm3d(self.ch3)\n\n self.relu = nn.ReLU(inplace=True)\n self.drop = nn.Dropout3d(self.drop_p)\n self.pool = nn.MaxPool3d((1, 2, 2))\n # Combining into linear layers now\n\n # * self.conv2_outshape[2],\n self.fc1 = nn.Linear(\n self.ch3 * self.conv3_outshape[0] * self.conv3_outshape[1] * self.conv3_outshape[2], self.fc_hidden1)\n # fully connected hidden layer\n self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)\n # fully connected layer, output = multi-classes\n self.fc3 = nn.Linear(self.fc_hidden2, self.num_classes)\n # Only using two fc layer\n\n def forward(self, x_3d):\n # Conv 1\n # print(x_3d.shape)\n x_3d = x_3d.type(torch.cuda.FloatTensor)\n x_3d = x_3d.permute(0, 1, 4, 2, 3)\n # Permuting shape so that it matches the format for conv3D\n # print(x_3d.shape)\n # print(self.conv1)\n x = self.conv1(x_3d)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.drop(x)\n # Conv 2\n x = self.conv2(x)\n x = 
self.bn2(x)\n x = self.relu(x)\n x = self.drop(x)\n # Conv 3\n x = self.conv3(x)\n x = self.bn3(x)\n x = self.relu(x)\n x = self.drop(x)\n # FC 1 and 2\n #print(\"Fully connected \",x.shape)\n x = x.view(x.size(0), -1)\n x = F.relu(self.fc1(x))\n x = F.relu(self.fc2(x))\n x = F.dropout(x, p=self.drop_p, training=self.training)\n x = self.fc3(x)\n\n return x\n\n#%%\n\n# set path\n\nsave_model_path = \"/home/chinmay/datatset/save_model/Conv3D_ckpt/\" # save Pytorch models\n\n\n# 3D CNN parameters\nfc_hidden1, fc_hidden2 = 256, 256\ndropout = 0.0 # dropout probability\n\n\n# Select which frame to begin & end in videos\nbegin_frame, end_frame, skip_frame = 1, 10, 1\n\n\ndef train(log_interval, model, device, train_loader, optimizer, epoch):\n # set model as training mode\n model.train()\n\n losses = []\n scores = []\n N_count = 0 # counting total trained sample in one epoch\n for batch_idx, (X, y) in enumerate(train_loader):\n # distribute data to device\n #X, y = X.to(device), y.to(device)\n X, y = X.cuda(), y.cuda()\n #print(\"The label is \",y)\n N_count += X.size(0)\n #print(\"The size is \",X.size())\n optimizer.zero_grad()\n output = model(X) # output size = (batch, number of classes)\n\n loss = F.binary_cross_entropy(output, y)\n losses.append(loss.item())\n #print(\"The loss is \",loss.item())\n # to compute accuracy\n# print(\"The output is \", output)\n y_pred = torch.max(output, 1)[1]\n #print(\"The label predicted is \",y_pred)\n step_score = accuracy_score(y.cpu().data.squeeze(\n ).numpy(), y_pred.cpu().data.squeeze().numpy())\n scores.append(step_score) # computed on CPU\n\n loss.backward()\n optimizer.step()\n\n # show information\n if (batch_idx + 1) % log_interval == 0:\n print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}, Accu: {:.2f}%'.format(\n epoch + 1, N_count, len(train_loader.dataset), 100. 
* (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))\n # torch.cuda.empty_cache()\n return losses, scores\n\n#%%\n\ndef validation(model, device, optimizer, test_loader):\n # set model as testing mode\n model.eval()\n\n test_loss = 0\n all_y = []\n all_y_pred = []\n with torch.no_grad():\n for X, y in test_loader:\n # distribute data to device\n #X, y = X.to(device), y.to(device)\n X, y = X.cuda(), y.cuda()\n output = model(X)\n\n loss = F.cross_entropy(output, y, reduction='sum')\n test_loss += loss.item() # sum up batch loss\n # (y_pred != output) get the index of the max log-probability\n y_pred = torch.max(output, 1)[1]\n\n # torch.from_numpy(np.asarray(y_pred))\n # collect all y and y_pred in all batches\n all_y.extend(y)\n all_y_pred.extend(y_pred)\n\n test_loss /= len(test_loader.dataset)\n\n # to compute accuracy\n# all_y = torch.stack(all_y, dim=0)\n# all_y_pred = torch.stack(all_y_pred, dim=0)\n all_y = torch.from_numpy(np.asarray(all_y))\n all_y_pred = torch.from_numpy(np.asarray(all_y_pred))\n test_score = accuracy_score(all_y.cpu().data.squeeze(\n ).numpy(), all_y_pred.cpu().data.squeeze().numpy())\n\n # show information\n print('\\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\\n'.format(\n len(all_y), test_loss, 100 * test_score))\n\n # save Pytorch models of best record\n torch.save(model.state_dict(), os.path.join(save_model_path,\n '3dcnn_epoch{}.pth'.format(epoch + 1))) # save spatial_encoder\n torch.save(optimizer.state_dict(), os.path.join(save_model_path,\n '3dcnn_optimizer_epoch{}.pth'.format(epoch + 1))) # save optimizer\n print(\"Epoch {} model saved!\".format(epoch + 1))\n\n return test_loss, test_score\n\n#%%\n\n##########################################\n# Be careful in running this cell\n# Skip it most of the times\n##########################################\ncnn3d = resnet10(sample_size=128, in_channels=10)\npre_load = True\nmodel_name = \"3dcnn_epoch40.pth\"\nif pre_load:\n model_path = save_model_path + model_name\n cnn3d.load_state_dict(torch.load(model_path))\n save_model_path = save_model_path + \"temp/\"\ncnn3d.cuda()\nprint(cnn3d)\n\n#%%\n\n# create model\n# cnn3d = CNN3D(t_dim=10, img_x=img_x, img_y=img_y,\n# drop_p=dropout, fc_hidden1=fc_hidden1, fc_hidden2=fc_hidden2, num_classes=2)\n\n# print(cnn3d.conv1)\n# Parallelize model to multiple GPUs\nif torch.cuda.device_count() > 1:\n print(\"Using\", torch.cuda.device_count(), \"GPUs!\")\n cnn3d = nn.DataParallel(cnn3d)\n\noptimizer = torch.optim.Adam(cnn3d.parameters(\n), lr=learning_rate, weight_decay=1e-6) # optimize all cnn parameters\n\n\n# record training process\nepoch_train_losses = []\nepoch_train_scores = []\nepoch_test_losses = []\nepoch_test_scores = []\n\n\n# start training\nfor epoch in range(epochs):\n # train, test model\n train_losses, train_scores = train(\n log_interval, cnn3d, device, train_loader, optimizer, epoch)\n epoch_test_loss, epoch_test_score = validation(\n cnn3d, device, optimizer, valid_loader)\n\n # save results\n epoch_train_losses.append(np.mean(train_losses))\n epoch_train_scores.append(np.mean(train_scores))\n # For validation set, it is already averaged\n epoch_test_losses.append(epoch_test_loss)\n epoch_test_scores.append(epoch_test_score)", "target_code": " A = np.array(epoch_train_losses)\n B = np.array(epoch_train_scores)\n C = np.array(epoch_test_losses)\n D = np.array(epoch_test_scores)\n np.save('./3DCNN_epoch_training_losses.npy', A)\n np.save('./3DCNN_epoch_training_scores.npy', B)\n np.save('./3DCNN_epoch_test_loss.npy', 
C)\n np.save('./3DCNN_epoch_test_score.npy', D)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom Resnet import resnet10\nimport os\nimport pickle\nimport torch\nimport torchvision\nimport numpy as np\nimport pandas as pd\nimport torch.nn as nn\nfrom PIL import Image\nfrom tqdm import tqdm\nimport torch.nn as nn\nimport torch.utils.data as data1\nfrom torch.utils import data\nimport torch.nn.functional as F\nimport torchvision.models as models\nimport matplotlib.pyplot as plt\nfrom torch.autograd import Variable\nfrom sklearn.metrics import accuracy_score\nimport torchvision.transforms as transforms\nfrom sklearn.preprocessing import OneHotEncoder, LabelEncoder\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\n\n\nfrom torch.utils import data\n\n\nclass Dataload_3D_CNN(data.Dataset):\n \"Characterizes a dataset for PyTorch\"\n\n def __init__(self, data_path, transform=None):\n \"Initialization\"\n self.transform = transform\n #self.frames = frames\n self.folders = data_path\n\n def __len__(self):\n \"Denotes the total number of samples\"\n return len(os.listdir(self.folders))\n\n def read_images(self, data_path, use_transform):\n X = []\n for i in os.listdir(data_path):\n #print(\"file name is \",i)\n image = Image.open(os.path.join(data_path, i))\n\n # print(image.shape)\n if use_transform is not None:\n image = use_transform(image)\n # print(image.size)\n image = torch.from_numpy(np.asarray(image))\n X.append(image)\n # print(X)\n #X = np.array(X)\n X = torch.stack(X, dim=0)\n\n return X\n\n def __getitem__(self, index):\n \"Generates one sample of data\"\n # Select sample\n #print(\"index passed is \",index)\n # print(self.folders)\n data_path = os.path.join(self.folders, os.listdir(self.folders)[index])\n #data_path = self.folders+ str(index)\n #print(\"Data path is \",data_path)\n\n # Load data\n # (input) spatial images\n X = self.read_images(data_path, self.transform)\n\n y = 1\n if 'orig' in data_path:\n y = 0\n # print(X.shape)\n return X, torch.from_numpy(np.array(y)).type(torch.LongTensor)\n\n\nTRANSFORM_IMG = transforms.Compose([\n transforms.Resize(128),\n # transforms.CenterCrop(256),\n # transforms.ToTensor()\n # transforms.Normalize(mean=[0.485, 0.456, 0.406],\n # std=[0.229, 0.224, 0.225] )\n])\n\n\ntrain_path = '/home/chinmay/datatset/train/'\ntrain_data = Dataload_3D_CNN(train_path, transform=TRANSFORM_IMG)\n# for step, (x, y) in enumerate(data):\n# print(x.shape)\nval_path = '/home/chinmay/datatset/val/'\nval_data = Dataload_3D_CNN(val_path, transform=TRANSFORM_IMG)\n\n\nepochs = 20\nbatch_size = 64\nlearning_rate = 6 * 1e-5\nlog_interval = 10\nimg_x, img_y = 96, 96 # 128,128#256, 256 # resize video 2d frame size\n\n\n# Detect devices\nuse_cuda = torch.cuda.is_available() # check if GPU exists\ndevice = torch.device(\"cuda\" if use_cuda else \"cpu\") # use CPU or GPU\nprint(\"Is use_cuda\", use_cuda)\n# Now load the dataset\nparams = {'batch_size': batch_size, 'shuffle': True,\n 'num_workers': 4, 'pin_memory': True} if use_cuda else {}\n# Load the dataset\n\ntrain_loader = data1.DataLoader(train_data, **params)\nvalid_loader = data1.DataLoader(val_data, **params)\n\n\n## ------------------------ 3D CNN module ---------------------- ##\ndef conv3D_output_size(img_size, padding, kernel_size, stride):\n # print(\"Image size is \",img_size)\n # compute output shape of conv3D\n outshape = (np.floor((img_size[0] + 2 * padding[0] - (kernel_size[0] - 1) - 1) / stride[0] + 1).astype(int),\n np.floor((img_size[1] + 2 * 
padding[1] -\n (kernel_size[1] - 1) - 1) / stride[1] + 1).astype(int),\n np.floor((img_size[2] + 2 * padding[2] - (kernel_size[2] - 1) - 1) / stride[1] + 1).astype(int))\n# print(\"The output shape is \",outshape)\n return outshape\n\n\nclass CNN3D(nn.Module):\n def __init__(self, t_dim=120, img_x=90, img_y=120, drop_p=0.2, fc_hidden1=256, fc_hidden2=128, fc_hidden3=64, num_classes=2):\n super(CNN3D, self).__init__()\n\n # set video dimension\n self.t_dim = t_dim\n self.img_x = img_x\n self.img_y = img_y\n # fully connected layer hidden nodes\n self.fc_hidden1, self.fc_hidden2, self.fc_hidden3 = fc_hidden1, fc_hidden2, fc_hidden3\n self.drop_p = drop_p\n self.num_classes = num_classes\n self.ch1, self.ch2, self.ch3 = 32, 48, 64\n self.k1, self.k2, self.k3 = (\n 1, 3, 3), (1, 3, 3), (1, 3, 3) # 3d kernel size\n self.s1, self.s2, self.s3 = (\n 1, 1, 1), (1, 1, 1), (1, 1, 1) # 3d strides\n self.pd1, self.pd2, self.pd3 = (\n 0, 0, 0), (0, 0, 0), (0, 0, 0) # 3d padding\n\n # compute conv1 & conv2 output shape\n self.conv1_outshape = conv3D_output_size(\n (3, self.img_x, self.img_y), self.pd1, self.k1, self.s1)\n self.conv2_outshape = conv3D_output_size(\n self.conv1_outshape, self.pd2, self.k2, self.s2)\n self.conv3_outshape = conv3D_output_size(\n self.conv2_outshape, self.pd3, self.k3, self.s3)\n\n self.conv1 = nn.Conv3d(in_channels=10, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1,\n padding=self.pd1)\n self.bn1 = nn.BatchNorm3d(self.ch1)\n self.conv2 = nn.Conv3d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2,\n padding=self.pd2)\n self.bn2 = nn.BatchNorm3d(self.ch2)\n self.conv3 = nn.Conv3d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3,\n padding=self.pd3)\n self.bn3 = nn.BatchNorm3d(self.ch3)\n\n self.relu = nn.ReLU(inplace=True)\n self.drop = nn.Dropout3d(self.drop_p)\n self.pool = nn.MaxPool3d((1, 2, 2))\n # Combining into linear layers now\n\n # * self.conv2_outshape[2],\n self.fc1 = nn.Linear(\n self.ch3 * self.conv3_outshape[0] * self.conv3_outshape[1] * self.conv3_outshape[2], self.fc_hidden1)\n # fully connected hidden layer\n self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)\n # fully connected layer, output = multi-classes\n self.fc3 = nn.Linear(self.fc_hidden2, self.num_classes)\n # Only using two fc layer\n\n def forward(self, x_3d):\n # Conv 1\n # print(x_3d.shape)\n x_3d = x_3d.type(torch.cuda.FloatTensor)\n x_3d = x_3d.permute(0, 1, 4, 2, 3)\n # Permuting shape so that it matches the format for conv3D\n # print(x_3d.shape)\n # print(self.conv1)\n x = self.conv1(x_3d)\n x = self.bn1(x)\n x = self.relu(x)\n x = self.drop(x)\n # Conv 2\n x = self.conv2(x)\n x = self.bn2(x)\n x = self.relu(x)\n x = self.drop(x)\n # Conv 3\n x = self.conv3(x)\n x = self.bn3(x)\n x = self.relu(x)\n x = self.drop(x)\n # FC 1 and 2\n #print(\"Fully connected \",x.shape)\n x = x.view(x.size(0), -1)\n x = F.relu(self.fc1(x))\n x = F.relu(self.fc2(x))\n x = F.dropout(x, p=self.drop_p, training=self.training)\n x = self.fc3(x)\n\n return x\n\n\n# set path\n\nsave_model_path = \"/home/chinmay/datatset/save_model/Conv3D_ckpt/\" # save Pytorch models\n\n\n# 3D CNN parameters\nfc_hidden1, fc_hidden2 = 256, 256\ndropout = 0.0 # dropout probability\n\n\n# Select which frame to begin & end in videos\nbegin_frame, end_frame, skip_frame = 1, 10, 1\n\n\ndef train(log_interval, model, device, train_loader, optimizer, epoch):\n # set model as training mode\n model.train()\n\n losses = []\n scores = []\n N_count = 0 # counting 
total trained sample in one epoch\n for batch_idx, (X, y) in enumerate(train_loader):\n # distribute data to device\n #X, y = X.to(device), y.to(device)\n X, y = X.cuda(), y.cuda()\n #print(\"The label is \",y)\n N_count += X.size(0)\n #print(\"The size is \",X.size())\n optimizer.zero_grad()\n output = model(X) # output size = (batch, number of classes)\n\n loss = F.binary_cross_entropy(output, y)\n losses.append(loss.item())\n #print(\"The loss is \",loss.item())\n # to compute accuracy\n# print(\"The output is \", output)\n y_pred = torch.max(output, 1)[1]\n #print(\"The label predicted is \",y_pred)\n step_score = accuracy_score(y.cpu().data.squeeze(\n ).numpy(), y_pred.cpu().data.squeeze().numpy())\n scores.append(step_score) # computed on CPU\n\n loss.backward()\n optimizer.step()\n\n # show information\n if (batch_idx + 1) % log_interval == 0:\n print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}, Accu: {:.2f}%'.format(\n epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))\n # torch.cuda.empty_cache()\n return losses, scores\n\n\ndef validation(model, device, optimizer, test_loader):\n # set model as testing mode\n model.eval()\n\n test_loss = 0\n all_y = []\n all_y_pred = []\n with torch.no_grad():\n for X, y in test_loader:\n # distribute data to device\n #X, y = X.to(device), y.to(device)\n X, y = X.cuda(), y.cuda()\n output = model(X)\n\n loss = F.cross_entropy(output, y, reduction='sum')\n test_loss += loss.item() # sum up batch loss\n # (y_pred != output) get the index of the max log-probability\n y_pred = torch.max(output, 1)[1]\n\n # torch.from_numpy(np.asarray(y_pred))\n # collect all y and y_pred in all batches\n all_y.extend(y)\n all_y_pred.extend(y_pred)\n\n test_loss /= len(test_loader.dataset)\n\n # to compute accuracy\n# all_y = torch.stack(all_y, dim=0)\n# all_y_pred = torch.stack(all_y_pred, dim=0)\n all_y = torch.from_numpy(np.asarray(all_y))\n all_y_pred = torch.from_numpy(np.asarray(all_y_pred))\n test_score = accuracy_score(all_y.cpu().data.squeeze(\n ).numpy(), all_y_pred.cpu().data.squeeze().numpy())\n\n # show information\n print('\\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\\n'.format(\n len(all_y), test_loss, 100 * test_score))\n\n # save Pytorch models of best record\n torch.save(model.state_dict(), os.path.join(save_model_path,\n '3dcnn_epoch{}.pth'.format(epoch + 1))) # save spatial_encoder\n torch.save(optimizer.state_dict(), os.path.join(save_model_path,\n '3dcnn_optimizer_epoch{}.pth'.format(epoch + 1))) # save optimizer\n print(\"Epoch {} model saved!\".format(epoch + 1))\n\n return test_loss, test_score\n\n\n##########################################\n# Be careful in running this cell\n# Skip it most of the times\n##########################################\ncnn3d = resnet10(sample_size=128, in_channels=10)\npre_load = True\nmodel_name = \"3dcnn_epoch40.pth\"\nif pre_load:\n model_path = save_model_path + model_name\n cnn3d.load_state_dict(torch.load(model_path))\n save_model_path = save_model_path + \"temp/\"\ncnn3d.cuda()\nprint(cnn3d)\n\n\n# create model\n# cnn3d = CNN3D(t_dim=10, img_x=img_x, img_y=img_y,\n# drop_p=dropout, fc_hidden1=fc_hidden1, fc_hidden2=fc_hidden2, num_classes=2)\n\n# print(cnn3d.conv1)\n# Parallelize model to multiple GPUs\nif torch.cuda.device_count() > 1:\n print(\"Using\", torch.cuda.device_count(), \"GPUs!\")\n cnn3d = nn.DataParallel(cnn3d)\n\noptimizer = torch.optim.Adam(cnn3d.parameters(\n), lr=learning_rate, 
weight_decay=1e-6) # optimize all cnn parameters\n\n\n# record training process\nepoch_train_losses = []\nepoch_train_scores = []\nepoch_test_losses = []\nepoch_test_scores = []\n\n\n# start training\nfor epoch in range(epochs):\n # train, test model\n train_losses, train_scores = train(\n log_interval, cnn3d, device, train_loader, optimizer, epoch)\n epoch_test_loss, epoch_test_score = validation(\n cnn3d, device, optimizer, valid_loader)\n\n # save results\n epoch_train_losses.append(np.mean(train_losses))\n epoch_train_scores.append(np.mean(train_scores))\n # For validation set, it is already averaged\n epoch_test_losses.append(epoch_test_loss)\n epoch_test_scores.append(epoch_test_score)\n", "project_metadata": {"full_name": "chinmay5/FakeDetection", "description": "Fake video detection code for both DeepFake and F2F dataset", "topics": [], "git_url": "git://github.com/chinmay5/FakeDetection.git", "stars": 5, "watchers": 5, "forks": 6, "created": "2019-02-06T19:06:40Z", "size": 705, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2890902, "Python": 20520}, "last_updated": "2020-09-18T16:30:18Z"}, "intent": " # save all train test results"}, {"original_comment": " # Extract tweets from json wrapper\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Methods for Computational Politics\n\n# ## Introduction\n# The 21st century has brought with it a wealth of new methods for data collection and analysis. Due to the convergence of digital trace data availability, more transparent data and code sharing norms, and relatively cheap and plentiful computer processing capabilities, researchers with a laptop and an internet connection can now access a large and growing set of tools and methods. The implications of the \"computational revolution\" is profound for the study of social and political processes, but the skills required to collect and harness these new sources of data are typically not taught to social scientists. This code is for anyone who is interested in studying phenomena that are of social and political importance using the growing set of computational methods that are being used to understand these phenomena in new ways. While these processes are reasonably accessible, it is also critical to understand their limits especially in an adversarial problem space such as bot-detection.\n\n# ## Part 1: Authentication, Tweet Extraction, and Bot Detection\n# The first part of the code examines how researchers can extract data from Twitter, a popular online news and social networking platform. While Twitter provides more access to its data than many other platforms, extracting data from Twitter's servers requires using its Application Programming Interface (or API). This part of the code will cover useful concepts when working with Twitter's API, including rate limiting and error handling, as well as methods for detecting whether Twitter users are humans or automated accounts (i.e. bots). This code is designed to run on Google's Colab for ease of access but you could just as easily run it locally on Jupyter notebooks.\n\n# Emmi Bevensee
\n# University of Arizona
\n# Email: emmibevensee@email.arizona.edu
\n\n# ## 1. Importing libraries and authenticating Twitter\n\n#%%\n\n# Install necesarry packages\n# !pip install tweepy\n# !pip install seaborn\nfrom __future__ import print_function, unicode_literals\nimport traceback\nfrom google.colab import drive\nimport re\nimport pickle\nimport csv\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport requests\nimport pandas as pd\nimport os\nimport time\nimport json\nimport botometer\nimport tweepy\nget_ipython().system('pip install botometer')\n\n#%%\n\n# Import the libraries we need\n\n#%%\n\n# Check working directory\nos.getcwd()\n\n#%%\n\n# get rid of this if you're running locally\ndrive.mount('/content/drive')\n\n#%%\n\n# Change working directory to where you want it to be\nos.chdir('change to your correct path')\n\n# Check working directory\nos.getcwd()\n\n#%%\n\n# get ya keys in order!\nconsumer_key = 'PUT YOUR KEY HERE'\nconsumer_secret = 'PUT YOUR KEY HERE'\naccess_token = 'PUT YOUR KEY HERE'\naccess_token_secret = 'PUT YOUR KEY HERE'\n\n# https://rapidapi.com/OSoMe/api/botometer\nmashape_key = 'PUT YOUR KEY HERE'\n\n\n# Authenticate\nauth = tweepy.OAuthHandler(consumer_key, consumer_secret)\nauth.set_access_token(access_token, access_token_secret)\n\napi = tweepy.API(auth, wait_on_rate_limit=True,\n wait_on_rate_limit_notify=True) # added these waiting calls\n\n#%%\n\n# sets a label for what the large election we are breaking down is\nmajor_election = 'EU_parliament'\n\n#%%\n\ndef botdetectfunction(election, leaning, hashtags):\n \"\"\"## 2. Pulling relevant tweets\"\"\"\n\n # Keep track of API calls\n # For more information, visit https://developer.twitter.com/en/docs/basics/rate-limits.html\n calls = 0\n\n # Pulling most recent tweets (up to 100)\n tweets_raw = api.search(hashtags, count=100, result_type='recent')\n calls += 1", "target_code": " tweets = [t._json for t in tweets_raw]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Methods for Computational Politics\n\n# ## Introduction\n# The 21st century has brought with it a wealth of new methods for data collection and analysis. Due to the convergence of digital trace data availability, more transparent data and code sharing norms, and relatively cheap and plentiful computer processing capabilities, researchers with a laptop and an internet connection can now access a large and growing set of tools and methods. The implications of the \"computational revolution\" is profound for the study of social and political processes, but the skills required to collect and harness these new sources of data are typically not taught to social scientists. This code is for anyone who is interested in studying phenomena that are of social and political importance using the growing set of computational methods that are being used to understand these phenomena in new ways. While these processes are reasonably accessible, it is also critical to understand their limits especially in an adversarial problem space such as bot-detection.\n\n# ## Part 1: Authentication, Tweet Extraction, and Bot Detection\n# The first part of the code examines how researchers can extract data from Twitter, a popular online news and social networking platform. While Twitter provides more access to its data than many other platforms, extracting data from Twitter's servers requires using its Application Programming Interface (or API). 
This part of the code will cover useful concepts when working with Twitter's API, including rate limiting and error handling, as well as methods for detecting whether Twitter users are humans or automated accounts (i.e. bots). This code is designed to run on Google's Colab for ease of access but you could just as easily run it locally on Jupyter notebooks.\n\n# Emmi Bevensee
\n# University of Arizona
\n# Email: emmibevensee@email.arizona.edu
\n\n# ## 1. Importing libraries and authenticating Twitter\n\n\n# Install necesarry packages\n# !pip install tweepy\n# !pip install seaborn\nfrom __future__ import print_function, unicode_literals\nimport traceback\nfrom google.colab import drive\nimport re\nimport pickle\nimport csv\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport requests\nimport pandas as pd\nimport os\nimport time\nimport json\nimport botometer\nimport tweepy\nget_ipython().system('pip install botometer')\n\n\n# Import the libraries we need\n\n\n# Check working directory\nos.getcwd()\n\n\n# get rid of this if you're running locally\ndrive.mount('/content/drive')\n\n\n# Change working directory to where you want it to be\nos.chdir('change to your correct path')\n\n# Check working directory\nos.getcwd()\n\n\n# get ya keys in order!\nconsumer_key = 'PUT YOUR KEY HERE'\nconsumer_secret = 'PUT YOUR KEY HERE'\naccess_token = 'PUT YOUR KEY HERE'\naccess_token_secret = 'PUT YOUR KEY HERE'\n\n# https://rapidapi.com/OSoMe/api/botometer\nmashape_key = 'PUT YOUR KEY HERE'\n\n\n# Authenticate\nauth = tweepy.OAuthHandler(consumer_key, consumer_secret)\nauth.set_access_token(access_token, access_token_secret)\n\napi = tweepy.API(auth, wait_on_rate_limit=True,\n wait_on_rate_limit_notify=True) # added these waiting calls\n\n\n# sets a label for what the large election we are breaking down is\nmajor_election = 'EU_parliament'\n\n\ndef botdetectfunction(election, leaning, hashtags):\n \"\"\"## 2. Pulling relevant tweets\"\"\"\n\n # Keep track of API calls\n # For more information, visit https://developer.twitter.com/en/docs/basics/rate-limits.html\n calls = 0\n\n # Pulling most recent tweets (up to 100)\n tweets_raw = api.search(hashtags, count=100, result_type='recent')\n calls += 1\n", "project_metadata": {"full_name": "EmmiBevensee/Elections_Bot_Detection_pipeline", "description": "This code was designed to look at bot frequencies in the EU Parliament elections of 2019 and compare between nations and political groupings.", "topics": [], "git_url": "git://github.com/EmmiBevensee/Elections_Bot_Detection_pipeline.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2019-05-16T00:03:56Z", "size": 653, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1124929}, "last_updated": "2019-11-14T15:44:34Z"}, "intent": " # Extract tweets from json wrapper"}, {"original_comment": "# we add noise to the checkerboard for figure 3b\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Example showing how mix python and madagascar processing #\n# ## Notebook creates synthetic for TLE \"Edge-preserving smoothing ... by Yi Luo et. al. ##\n# Reproduced by Luke Decker and Karl Schleicher\n\n#%%\n\n# basic imports.\nimport matplotlib.pyplot as plt\nimport os\nimport numpy as np\nimport m8r\n\n# Use the 'nbagg' backend to render and you can zoom and pan plots.\nbackend = 'nbagg'\nbackend = 'inline'\nif backend == 'nbagg':\n import matplotlib\n matplotlib.use('nbagg') # Use the 'nbagg' backend\nelse:\n # magic command to render inline\n get_ipython().run_line_magic('matplotlib', 'inline')\n\n# import matplotlib.pyplot must be after matplotlib.use('nbagg')\n\n#%%\n\n# create a numpy ndarray (a 2D array) of all floating point zeros\na = np.zeros((61, 72), dtype='float32')\n# Make a section with two faulsted horizons. Sample rate is 4 ms.\n# Data will be high cut filterred at 62.5 hz.\n# Make two horizons. 
fault at center has throw of two samples.\n# On left side of section horizons are at samples 24 and 28.\n# On right side of section horizons are at samples 22 and 46.\na[0:30, 24] = 1.0\na[30:61, 20] = 1.0\na[0:30, 48] = 1.0\na[30:61, 44] = 1.0\n\n#%%\n\n# use high cut filter to ideal model to create synthetic seismic for Figure 1\ndeltat = .004\nitrace = 0\ntrace = a[0]\nfrequencies = np.fft.rfftfreq(trace.size, d=deltat)\n\n# apply high cut filter using numpy library\nramp_freq = np.array([.0, 55., 70., 125.])\nramp_amp = np.array([1.0, 1.0, .0, .0])\nramp = np.square(\n np.cos(np.pi/2.0*(1.0-np.interp(frequencies, ramp_freq, ramp_amp))))\n#fig, ax = plt.subplots(figsize=(15, 2))\n# plt.plot(frequencies,ramp)\n# plt.show()\n\na_filterred = np.zeros((61, 72), dtype='float32')\nfor itrace in range(a.shape[0]):\n trace = a[itrace]\n TRACE = np.fft.rfft(trace)\n TRACE_FILTERRED = ramp*TRACE\n trace_filterred = np.fft.irfft(TRACE_FILTERRED)\n # multiple by 2 to make peak of wavelet 1.0 instaed of .5\n a_filterred[itrace, :] = 2.0*trace_filterred\n\n#ax.set_xlabel('Time in Seconds')\n# ax.set_ylabel('Amplitude')\nfig = plt.figure(figsize=(7, 7))\nplt.title('Figure 1')\nplt.imshow(a_filterred.T)\nplt.show()\n\n#%%\n\n# define function for mean smoothing, works on 1D arrays, nfilt is smoothing radius\n# number of points in filter is 2*nfilt+1\ndef mean_smooth(array, nfilt):\n # determine array size\n n = array.shape[0]\n # decleare mean array\n mean = np.zeros(n, dtype='float32')\n for i in range(n):\n # create temporary array with date, making sure to not go out of bounds\n temp = array[max(0, i-nfilt):min(n-1, i+nfilt)]\n # determine the average\n mean[i] = np.average(temp)\n return mean\n\n#%%\n\n# define function for edge preserving smoothing, works on 1D arrays, nfilt is smoothing radius\n# number of points in filter is 2*nfilt+1\ndef edge_smooth(array, nfilt,):\n # determine array size\n n = array.shape[0]\n # decleare output array, using float32 for Madagascar compatibility\n edge = np.zeros(n, dtype='float32')\n # declare variance array\n var = np.zeros(n, dtype='float32')\n # declare array for storing mean\n mean = np.zeros(n, dtype='float32')\n # loop through data\n for i in range(n):\n # generate temporary array to hold local window\n temp = array[max(0, i-nfilt):min(n, i+nfilt)]\n # calculate variance\n var[i] = np.var(temp)\n # and the mean\n mean[i] = np.average(temp)\n # now we loop through again, selecting the minimum variance window\n for i in range(n):\n edge[i] = mean[np.argmin(\n var[max(0, i-nfilt):min(n, i+nfilt)]) + i - nfilt - min(0, i-nfilt)]\n return edge\n\n#%%\n\n# now we recreate figure 2\nfig = plt.figure(figsize=(7, 7))\n\nax1 = plt.subplot2grid((2, 2), (0, 0),)\nax2 = plt.subplot2grid((2, 2), (1, 0),)\nax3 = plt.subplot2grid((2, 2), (0, 1),)\nax4 = plt.subplot2grid((2, 2), (1, 1),)\n# figure 2a\nax1.plot(a_filterred[:, 20])\nax1.set_xlabel(\"Noise-Free Signal\")\nax1.set_ylim([-0.5, 1.5])\nax1.set_ylabel(\"Function Amplitude\")\n# figure 2b\na_noise = a_filterred+(np.random.rand(a.shape[0], a.shape[1])-.5)\nax2.plot(a_noise[:, 20])\nax2.set_xlabel(\"Noise-Added Signal\")\nax2.set_ylim([-0.5, 1.5])\nax2.set_ylabel(\"Function Amplitude\")\n# figure 2c\na_mean = mean_smooth(a_noise[:, 20], 10)\nax3.plot(a_mean)\nax3.set_xlabel(\"After Regular Smoothing\")\nax3.set_ylim([-0.5, 1.5])\nax3.set_ylabel(\"Function Amplitude\")\n# figure 2d\na_edge = edge_smooth(a_noise[:, 20], 10)\nax4.plot(a_edge)\nax4.set_xlabel(\"After Edge-Preserving Smoothing\")\nax4.set_ylim([-0.5, 
1.5])\nax4.set_ylabel(\"Function Amplitude\")\n# ensure that labels don't overlap figures\nplt.tight_layout()\nplt.rcParams[\"figure.figsize\"] = 3, 3\nplt.show()\n\n\n# now that we have played around with the 1 dimensional reproduction and gotten our hands a little bit dirty, we may move onto 2 dimensions!\n\n#%%\n\n# lets define a function that creates checkerboards for us\n# creates checkerboards on n2 x n1 grid, with square length d2 x d1, initial offsets o2, o1.\ndef checkerboard(n2, n1, d2, d1, o2, o1):\n # initialize array\n board = np.zeros((n2, n1), dtype='float32')\n # loop through\n for j in range(n2):\n y = (j-o2)/d2+1\n for i in range(n1):\n x = (i-o1)/d1+1\n xy = x+y\n # why does this work? why doesn't this always return 0?\n board[j, i] = (xy-xy/2-xy/2)\n return board\n\n#%%\n\n# define function for mean smoothing\ndef mean_smooth_2d(array, nfilt2, nfilt1, flag=1):\n # flag is a parameter related to the smoothing stencil.\n # flag=1 corresponds to cell centered, odd number stencils.\n # flag=0 corresponds to skewed, even number stencils. This was added so\n # we may properly reproduce the checkerboard figure, which appears to have\n # been completed using a 4x4 skew stencil\n # determine array dimensions, python is (n2,n1)\n n2 = np.shape(array)[0]\n n1 = np.shape(array)[1]\n # initialize the output array\n mean = np.zeros(np.shape(array), dtype='float32')\n # loop through elements\n for j in range(n2):\n for i in range(n1):\n # window out the local array\n temp = array[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # calculate mean\n mean[j, i] = np.average(temp)\n return mean\n\n#%%\n\n# define function for edge preserving smoothing\n# how is this different than the 1d case in terms of implementation and how dimensions are dealt with?\ndef edge_smooth_2d(array, nfilt2, nfilt1, flag=1):\n # flag is a parameter related to the smoothing stencil.\n # flag=1 corresponds to cell centered, odd number stencils.\n # flag=0 corresponds to skewed, even number stencils. 
This was added so\n # we may properly reproduce the checkerboard figure, which appears to have\n # been completed using a 4x4 skew stencil\n # get dimensions\n n2 = np.shape(array)[0]\n n1 = np.shape(array)[1]\n # array for holding variances\n var = np.zeros(np.shape(array), dtype='float32')\n # array for holding means\n mean = np.zeros(np.shape(array), dtype='float32')\n # loop through indxes and calculate variance for each window\n for j in range(n2):\n for i in range(n1):\n # window out the local array\n temp = array[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # calculate variance\n var[j, i] = np.var(temp)\n # calculate mean\n mean[j, i] = np.mean(temp)\n # now we again loop through indexes to determine which window within the\n # acceptable range of each index has the lowest variance\n edge = np.zeros(np.shape(array), dtype='float32')\n for j in range(n2):\n for i in range(n1):\n temp = var[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # determine index with minimal variance\n mindex = np.unravel_index(temp.argmin(), np.shape(temp))\n # transform this local index into an index in the global array\n w1 = mindex[1] - nfilt1 + max(0, nfilt1-i) + i\n w2 = mindex[0] - nfilt2 + max(0, nfilt2-j) + j\n # write mean of this variance-minimizing window to array\n edge[j, i] = mean[w2, w1]\n return edge\n\n\n# Here we attempt to recreate the checkerboard figure using a cell centered 5x5 stencil.\n# However, with the 4x4 checkerboards this doesn't lead to the results we want!\n# Please see below.\n\n#%%\n\n# lets generate the checkerboard for figure 3a using the program we just wrote\nboard = checkerboard(52, 52, 4, 4, 0, 0)\n# we add noise to the checkerboard for figure 3b\nboard_noise = board + (np.random.rand(board.shape[0], board.shape[1])-.5)\n# and then do 5x5 mean value smoothing (2x2 smoothing radius) for figure 3c\nboard_mean = mean_smooth_2d(board_noise, 2, 2)\n# as well as 5x5 edge-preserving smoothing (2x2 smoothing radius) for figure 3d\nboard_edge = edge_smooth_2d(board_noise, 2, 2)\n\n# and plot the results to generate Figure 3\nfig = plt.figure(figsize=(7, 7))\n\nax1 = plt.subplot2grid((2, 2), (0, 0),)\nax2 = plt.subplot2grid((2, 2), (0, 1),)\nax3 = plt.subplot2grid((2, 2), (1, 0),)\nax4 = plt.subplot2grid((2, 2), (1, 1),)\n\n# figure 3a\nax1.imshow(board)\nax1.set_xlabel(\"X Index\")\nax1.set_ylabel(\"Y Index\")\n# figure 3b\nax2.imshow(board_noise)\nax2.set_xlabel(\"X Index\")\nax2.set_ylabel(\"Y Index\")\n# figure 3c\nax3.imshow(board_mean)\nax3.set_xlabel(\"X Index\")\nax3.set_ylabel(\"Y Index\")\n# figure 3d\nax4.imshow(board_edge)\nax4.set_xlabel(\"X Index\")\nax4.set_ylabel(\"Y Index\")\n\n# ensure that labels don't overlap figures\nplt.tight_layout()\nplt.show()\n\n\n# So now we recreate the experiment using skewed 4x4 stencils.\n# This corresponds to setting flag=0 in the function calls.\n\n#%%\n\n# but these dont look right! so we adjust our programs using the \"flag\" variable to only use a 4x4 stencil\n# lets generate the checkerboard for figure 3a using the program we just wrote\nboard = checkerboard(52, 52, 4, 4, 0, 0)", "target_code": "board_noise = board + (np.random.rand(board.shape[0], board.shape[1])-.5)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Example showing how mix python and madagascar processing #\n# ## Notebook creates synthetic for TLE \"Edge-preserving smoothing ... by Yi Luo et. al. 
##\n# Reproduced by Luke Decker and Karl Schleicher\n\n\n# basic imports.\nimport matplotlib.pyplot as plt\nimport os\nimport numpy as np\nimport m8r\n\n# Use the 'nbagg' backend to render and you can zoom and pan plots.\nbackend = 'nbagg'\nbackend = 'inline'\nif backend == 'nbagg':\n import matplotlib\n matplotlib.use('nbagg') # Use the 'nbagg' backend\nelse:\n # magic command to render inline\n get_ipython().run_line_magic('matplotlib', 'inline')\n\n# import matplotlib.pyplot must be after matplotlib.use('nbagg')\n\n\n# create a numpy ndarray (a 2D array) of all floating point zeros\na = np.zeros((61, 72), dtype='float32')\n# Make a section with two faulsted horizons. Sample rate is 4 ms.\n# Data will be high cut filterred at 62.5 hz.\n# Make two horizons. fault at center has throw of two samples.\n# On left side of section horizons are at samples 24 and 28.\n# On right side of section horizons are at samples 22 and 46.\na[0:30, 24] = 1.0\na[30:61, 20] = 1.0\na[0:30, 48] = 1.0\na[30:61, 44] = 1.0\n\n\n# use high cut filter to ideal model to create synthetic seismic for Figure 1\ndeltat = .004\nitrace = 0\ntrace = a[0]\nfrequencies = np.fft.rfftfreq(trace.size, d=deltat)\n\n# apply high cut filter using numpy library\nramp_freq = np.array([.0, 55., 70., 125.])\nramp_amp = np.array([1.0, 1.0, .0, .0])\nramp = np.square(\n np.cos(np.pi/2.0*(1.0-np.interp(frequencies, ramp_freq, ramp_amp))))\n#fig, ax = plt.subplots(figsize=(15, 2))\n# plt.plot(frequencies,ramp)\n# plt.show()\n\na_filterred = np.zeros((61, 72), dtype='float32')\nfor itrace in range(a.shape[0]):\n trace = a[itrace]\n TRACE = np.fft.rfft(trace)\n TRACE_FILTERRED = ramp*TRACE\n trace_filterred = np.fft.irfft(TRACE_FILTERRED)\n # multiple by 2 to make peak of wavelet 1.0 instaed of .5\n a_filterred[itrace, :] = 2.0*trace_filterred\n\n#ax.set_xlabel('Time in Seconds')\n# ax.set_ylabel('Amplitude')\nfig = plt.figure(figsize=(7, 7))\nplt.title('Figure 1')\nplt.imshow(a_filterred.T)\nplt.show()\n\n\n# define function for mean smoothing, works on 1D arrays, nfilt is smoothing radius\n# number of points in filter is 2*nfilt+1\ndef mean_smooth(array, nfilt):\n # determine array size\n n = array.shape[0]\n # decleare mean array\n mean = np.zeros(n, dtype='float32')\n for i in range(n):\n # create temporary array with date, making sure to not go out of bounds\n temp = array[max(0, i-nfilt):min(n-1, i+nfilt)]\n # determine the average\n mean[i] = np.average(temp)\n return mean\n\n\n# define function for edge preserving smoothing, works on 1D arrays, nfilt is smoothing radius\n# number of points in filter is 2*nfilt+1\ndef edge_smooth(array, nfilt,):\n # determine array size\n n = array.shape[0]\n # decleare output array, using float32 for Madagascar compatibility\n edge = np.zeros(n, dtype='float32')\n # declare variance array\n var = np.zeros(n, dtype='float32')\n # declare array for storing mean\n mean = np.zeros(n, dtype='float32')\n # loop through data\n for i in range(n):\n # generate temporary array to hold local window\n temp = array[max(0, i-nfilt):min(n, i+nfilt)]\n # calculate variance\n var[i] = np.var(temp)\n # and the mean\n mean[i] = np.average(temp)\n # now we loop through again, selecting the minimum variance window\n for i in range(n):\n edge[i] = mean[np.argmin(\n var[max(0, i-nfilt):min(n, i+nfilt)]) + i - nfilt - min(0, i-nfilt)]\n return edge\n\n\n# now we recreate figure 2\nfig = plt.figure(figsize=(7, 7))\n\nax1 = plt.subplot2grid((2, 2), (0, 0),)\nax2 = plt.subplot2grid((2, 2), (1, 0),)\nax3 = plt.subplot2grid((2, 
2), (0, 1),)\nax4 = plt.subplot2grid((2, 2), (1, 1),)\n# figure 2a\nax1.plot(a_filterred[:, 20])\nax1.set_xlabel(\"Noise-Free Signal\")\nax1.set_ylim([-0.5, 1.5])\nax1.set_ylabel(\"Function Amplitude\")\n# figure 2b\na_noise = a_filterred+(np.random.rand(a.shape[0], a.shape[1])-.5)\nax2.plot(a_noise[:, 20])\nax2.set_xlabel(\"Noise-Added Signal\")\nax2.set_ylim([-0.5, 1.5])\nax2.set_ylabel(\"Function Amplitude\")\n# figure 2c\na_mean = mean_smooth(a_noise[:, 20], 10)\nax3.plot(a_mean)\nax3.set_xlabel(\"After Regular Smoothing\")\nax3.set_ylim([-0.5, 1.5])\nax3.set_ylabel(\"Function Amplitude\")\n# figure 2d\na_edge = edge_smooth(a_noise[:, 20], 10)\nax4.plot(a_edge)\nax4.set_xlabel(\"After Edge-Preserving Smoothing\")\nax4.set_ylim([-0.5, 1.5])\nax4.set_ylabel(\"Function Amplitude\")\n# ensure that labels don't overlap figures\nplt.tight_layout()\nplt.rcParams[\"figure.figsize\"] = 3, 3\nplt.show()\n\n\n# now that we have played around with the 1 dimensional reproduction and gotten our hands a little bit dirty, we may move onto 2 dimensions!\n\n\n# lets define a function that creates checkerboards for us\n# creates checkerboards on n2 x n1 grid, with square length d2 x d1, initial offsets o2, o1.\ndef checkerboard(n2, n1, d2, d1, o2, o1):\n # initialize array\n board = np.zeros((n2, n1), dtype='float32')\n # loop through\n for j in range(n2):\n y = (j-o2)/d2+1\n for i in range(n1):\n x = (i-o1)/d1+1\n xy = x+y\n # why does this work? why doesn't this always return 0?\n board[j, i] = (xy-xy/2-xy/2)\n return board\n\n\n# define function for mean smoothing\ndef mean_smooth_2d(array, nfilt2, nfilt1, flag=1):\n # flag is a parameter related to the smoothing stencil.\n # flag=1 corresponds to cell centered, odd number stencils.\n # flag=0 corresponds to skewed, even number stencils. This was added so\n # we may properly reproduce the checkerboard figure, which appears to have\n # been completed using a 4x4 skew stencil\n # determine array dimensions, python is (n2,n1)\n n2 = np.shape(array)[0]\n n1 = np.shape(array)[1]\n # initialize the output array\n mean = np.zeros(np.shape(array), dtype='float32')\n # loop through elements\n for j in range(n2):\n for i in range(n1):\n # window out the local array\n temp = array[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # calculate mean\n mean[j, i] = np.average(temp)\n return mean\n\n\n# define function for edge preserving smoothing\n# how is this different than the 1d case in terms of implementation and how dimensions are dealt with?\ndef edge_smooth_2d(array, nfilt2, nfilt1, flag=1):\n # flag is a parameter related to the smoothing stencil.\n # flag=1 corresponds to cell centered, odd number stencils.\n # flag=0 corresponds to skewed, even number stencils. 
This was added so\n # we may properly reproduce the checkerboard figure, which appears to have\n # been completed using a 4x4 skew stencil\n # get dimensions\n n2 = np.shape(array)[0]\n n1 = np.shape(array)[1]\n # array for holding variances\n var = np.zeros(np.shape(array), dtype='float32')\n # array for holding means\n mean = np.zeros(np.shape(array), dtype='float32')\n # loop through indxes and calculate variance for each window\n for j in range(n2):\n for i in range(n1):\n # window out the local array\n temp = array[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # calculate variance\n var[j, i] = np.var(temp)\n # calculate mean\n mean[j, i] = np.mean(temp)\n # now we again loop through indexes to determine which window within the\n # acceptable range of each index has the lowest variance\n edge = np.zeros(np.shape(array), dtype='float32')\n for j in range(n2):\n for i in range(n1):\n temp = var[max(0, j-nfilt2):min(n2-1, j+nfilt2+flag),\n max(0, i-nfilt1):min(n1-1, i+nfilt1+flag)]\n # determine index with minimal variance\n mindex = np.unravel_index(temp.argmin(), np.shape(temp))\n # transform this local index into an index in the global array\n w1 = mindex[1] - nfilt1 + max(0, nfilt1-i) + i\n w2 = mindex[0] - nfilt2 + max(0, nfilt2-j) + j\n # write mean of this variance-minimizing window to array\n edge[j, i] = mean[w2, w1]\n return edge\n\n\n# Here we attempt to recreate the checkerboard figure using a cell centered 5x5 stencil.\n# However, with the 4x4 checkerboards this doesn't lead to the results we want!\n# Please see below.\n\n\n# lets generate the checkerboard for figure 3a using the program we just wrote\nboard = checkerboard(52, 52, 4, 4, 0, 0)\n# we add noise to the checkerboard for figure 3b\nboard_noise = board + (np.random.rand(board.shape[0], board.shape[1])-.5)\n# and then do 5x5 mean value smoothing (2x2 smoothing radius) for figure 3c\nboard_mean = mean_smooth_2d(board_noise, 2, 2)\n# as well as 5x5 edge-preserving smoothing (2x2 smoothing radius) for figure 3d\nboard_edge = edge_smooth_2d(board_noise, 2, 2)\n\n# and plot the results to generate Figure 3\nfig = plt.figure(figsize=(7, 7))\n\nax1 = plt.subplot2grid((2, 2), (0, 0),)\nax2 = plt.subplot2grid((2, 2), (0, 1),)\nax3 = plt.subplot2grid((2, 2), (1, 0),)\nax4 = plt.subplot2grid((2, 2), (1, 1),)\n\n# figure 3a\nax1.imshow(board)\nax1.set_xlabel(\"X Index\")\nax1.set_ylabel(\"Y Index\")\n# figure 3b\nax2.imshow(board_noise)\nax2.set_xlabel(\"X Index\")\nax2.set_ylabel(\"Y Index\")\n# figure 3c\nax3.imshow(board_mean)\nax3.set_xlabel(\"X Index\")\nax3.set_ylabel(\"Y Index\")\n# figure 3d\nax4.imshow(board_edge)\nax4.set_xlabel(\"X Index\")\nax4.set_ylabel(\"Y Index\")\n\n# ensure that labels don't overlap figures\nplt.tight_layout()\nplt.show()\n\n\n# So now we recreate the experiment using skewed 4x4 stencils.\n# This corresponds to setting flag=0 in the function calls.\n\n\n# but these dont look right! 
so we adjust our programs using the \"flag\" variable to only use a 4x4 stencil\n# lets generate the checkerboard for figure 3a using the program we just wrote\nboard = checkerboard(52, 52, 4, 4, 0, 0)\n", "project_metadata": {"full_name": "seg/repro-zoo-2018", "description": "Reproduced papers from the Reproducibility Zoo", "topics": [], "git_url": "git://github.com/seg/repro-zoo-2018.git", "stars": 21, "watchers": 21, "forks": 13, "created": "2018-10-09T22:17:57Z", "size": 13252, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5700597, "Lasso": 5439724, "Python": 251464, "C": 9691, "Shell": 304}, "last_updated": "2020-09-07T02:21:27Z"}, "intent": "# we add noise to the checkerboard for figure 3b"}, {"original_comment": "# plot the results using a line plot\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Analysis of the German league (Bundesliga)\n# This article is a journey through the history of the Bundesliga. Analyzing historical data (all classifications from 1963 until 2020), we will be able to answer many questions about the German league. What teams won the German league? What teams nearly won the Bundesliga? When did Bayern's hegemony start? What teams receive more penalties?\u00a0\u2026 and many more! Continue reading\n\n# # Introduction\n# Let's make a brief introduction for those that have never heard about the German league. \ud83d\ude4c\n#\n# The German football league commonly known as the Bundesliga is the first national football league in Germany, being one of the most popular professional sports leagues across the world. It was founded in 1963 after the unification of five regional leagues from West Germany and consisted initially of 16 teams.\n#\n# At the end of a match, the winning team is rewarded with three points (before the season 1995\u201396 with 2 points) and the losing team with zero. In case of a tie, both teams are rewarded with 1 point.\n#\n# In many European leagues, the bottom three teams are automatically relegated to the second division. On the contrary, in the Bundesliga, only the bottom two are directly relegated to the 2 Bundesliga. The 16th team in the Bundesliga and the 3th in the 2 Bundesliga contest a two-legged play-off for a place in the first division.\n#\n# The introduction is made! Now, we are ready to analyze the data \u2764\ufe0f\n\n# # Web data extraction\n# The historical data of the Bundesliga (from 1963 until 2020) was scraped from https://www.bdfutbol.com/. This website contains football rankings of the best European leagues.\n#\n# To scrape the data, we have used BeautifulSoup which is a popular Python library for extracting information from an HTML page. 
After obtaining all the data, we have stored it in a Pandas data frame for further processing.\n\n#%%\n\nimport chart_studio\nimport plotly.graph_objects as go\nimport chart_studio.plotly as py\nimport plotly\nimport time\n\nimport requests\nimport bs4\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nplt.style.use('seaborn')\n\n# the web page bdfutbol contains historical data about multiple european leagues - we have selected the german league\nweb_page = 'https://www.bdfutbol.com/es/t/'\nleague = 't.html#ger'\n\n# obtain the html code as a string\nresponse = requests.get(web_page + league)\nhtml = response.text\n\n# create a BeautifulSoup object\nsoup = bs4.BeautifulSoup(html, \"html.parser\")\ndf_scores = pd.DataFrame()\n\n# loop through the anchor tags\nfor anchor in soup.find_all(class_=\"bloc_temporades\")[4].find_all('a'):\n\n # get the hypertext reference and the text of the anchor tag\n page = anchor.get('href')\n season = anchor.text\n\n # obtain the html code as a string\n response = requests.get(web_page + page)\n html = response.text\n\n # create a BeautifulSoup object\n soup = bs4.BeautifulSoup(html, \"html.parser\")\n\n # obtain the table containing the annual classification\n table = soup.find(class_=\"taula_estil sortable\")\n df_league = pd.read_html(str(table))[0]\n\n # add the season - anchor text and append the data frame to df_scores\n df_league['season'] = season\n df_scores = df_scores.append(df_league)\n\n time.sleep(0.2)\n\n# visualize the first five rows of the scraped data frame\ndf_scores.head()\n\n\n# # Data Cleaning\n# Data Cleaning is the process of transforming raw data into a standardized form that can easily be analyzed with data analytics tools. In this particular case, before analyzing the data using Pandas, we perform a few cleaning operations. First, we remove unnecessary columns and rename the remaining ones using English terms (remember that the data was scraped from a Spanish website). Then, we modify the wrong data types. The column points (points obtained by a team during a particular season) is of data type object instead of integer due to the presence of asterisks. These asterisks are used to refer to explanations at the bottom of the web page and they are not relevant for this analysis. In fact, the data type is not imported correctly because of the existence of these asterisks in some entries of the column. 
Therefore, we have to remove them, before converting the column points to an integer data type.\n\n# ### Drop unnecessary columns\n\n#%%\n\n# drop the columns Unnamed:0 and Unnamed:2 - they do not contain valuable information\ndf_scores.drop(columns=['Unnamed: 0', 'Unnamed: 2'], inplace=True)\n\n#%%\n\n# check that the modification has been carried out properly\ndf_scores.columns\n\n\n# ### Rename the columns using English terms\n\n#%%\n\n# rename the columns\ndf_scores.rename({'Unnamed: 1': 'position', 'Unnamed: 3': 'club', 'Puntos': 'points', 'PJ': 'played', 'PG': 'won',\n 'PE': 'drawn', 'PP': 'lost', 'GF': 'goals_for', 'GC': 'goals_against', 'TA': 'yellow_card',\n 'TR': 'red_card'}, axis=1, inplace=True)\n\n#%%\n\n# check that the modification has been carried out properly\ndf_scores.columns\n\n\n# ### Modify incorrect data types\n\n#%%\n\n# in some cases the entries in the column (points) contain additional symbols\ndf_scores.points = df_scores.points.astype(str)\n\n# remove the symbols extracting only the digits\ndf_scores.points = df_scores.points.str.extract('(\\d+)')\n\n#%%\n\n# check that the modification has been carried out properly\ndf_scores.points.unique()\n\n#%%\n\n# convert columns (points) into int data type\ndf_scores.points = df_scores.points.astype(int)\n\n\n# After cleaning the data, we obtain a Pandas data frame that can be easily processed to extract conclusions. As shown below, the data frame contains information such as the number of games won, drawn, and lost, the number of yellow and red cards, the number of points, and the position in the ranking of all teams that took part in the Bundesliga from 1963 until 2020.\n\n#%%\n\ndf_scores.head()\n\n\n# # German league\u00a0winners\n# The Bundesliga has been played by 57 different clubs during its 57 years of existence (up to the season 2019\u201320); however, only twelve of them got their hands on the trophy. The following plot shows the German league winners from season 1963\u201364 until 2019\u201320.\n\n#%%\n\n# teams that won the Bundesliga\nfirst_position = df_scores[df_scores['position'] == 1].club.value_counts()\n\n# plot labels indicating the number of leagues\nfor i, value in enumerate(first_position):\n plt.text(value, i, str(value), horizontalalignment='right',\n verticalalignment='center', weight='bold', color='white', fontsize=14)\n\n# plot the results using an horizontal bar plot\nfirst_position.plot(kind='barh')\n\n# ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# labels and title\nplt.xlabel('Number of leagues', fontsize=14)\nplt.ylabel('Football teams', fontsize=14)\nplt.title('Bundesliga winners', fontsize=20)\n\n\n# As shown above, Bayern M\u00fcnchen is the most successful club in the history of the Bundesliga with 29 titles, which represents more than 50% of the leagues. The next most successful teams are Borussia M\u00f6nchengladbach and Borussia Dortmund which has won the Bundesliga five times. 
Apart from them, other teams such as Werder Bremen, Hamburger, Stuttgart, K\u00f6ln, and Kaiserslautern also had the honor of lifting the Bundesliga trophy multiple times.\n\n#%%\n\n# number of clubs that played in the Bundesliga at least one season\ndf_scores.club.nunique()\n\n#%%\n\n# number of clubs that won the Bundesliga\ndf_scores[df_scores['position'] == 1].club.nunique()\n\n#%%\n\n# number of seasons\ndf_scores.season.nunique()\n\n\n# # German league runner-ups\n# There are 6 football teams that have never won the league but they were on one or more occasions runner-ups: Alemannia Aachen, Bayer Leverkusen, Hertha Berliner, Meidericher, RB Leipzig, and Schalke 04. As shown below, Schalke 04 and Bayer Leverkusen have been particularly unlucky being runner-ups of the Bundesliga 7 and 5 times respectively. Additionally, we can also observe that Bayern Munich is the club that has been on more occasions runner-up of the Bundesliga (10 times).\n\n#%%\n\n# Bundesliga runner-ups\nsecond_position = df_scores[df_scores['position'] == 2].club.value_counts()\n\n# plot labels indicating the number times the team was runner-up\nfor i, value in enumerate(second_position):\n plt.text(value, i, str(value), horizontalalignment='right',\n verticalalignment='center', weight='bold', color='white', fontsize=12)\n\n# plot the results using an horizontal bar plot\nsecond_position.plot(kind='barh', color='maroon')\n\n# ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# labels and title\nplt.xlabel('Number of times', fontsize=14)\nplt.ylabel('Football teams', fontsize=14)\nplt.title('Bundesliga runner-ups', fontsize=20)\n\n\n# In the season 2016\u201317, RB Leipzig finished second in the Bundesliga. The club was just founded in 2009 and is currently one of the leading teams in Germany mainly because of the significant investments made by the company Red Bull.\n\n#%%\n\n# football teams that have never won the league but they were on one or more occasions runner-ups\nfirst_position = set(df_scores[df_scores['position'] == 1].club.unique())\n\nsecond_position = set(df_scores[df_scores['position'] == 2].club.unique())\n\nsecond_position.difference(first_position)\n\n\n# # Total number of seasons in the first\u00a0division (Top 10)\n# Werder Bremen holds the record for having played the most seasons in the German league. They have played the Bundesliga in 56 of its 57 seasons, being relegated to the second division only on one occasion. Bayern M\u00fcnchen has played in the Bundesliga uninterrupted since 1965, and Hamburger from 1963 until 2018, both of them 55 seasons in total. 
Apart from the aforementioned clubs, Borussia Dortmund, Stuttgart, Borussia M\u00f6nchengladbach, Schalke 04, and Eintracht Frankfurt have also participated in the German league more than 50 seasons.\n\n#%%\n\n# total number of seasons in the first division (top 10)\nnumber_of_seasons = df_scores.groupby('club').count(\n).position.sort_values(ascending=False).head(10)\n\n# plot labels indicating the number of seasons in the first division\nfor i, value in enumerate(number_of_seasons):\n plt.text(i, value, str(value), horizontalalignment='center',\n verticalalignment='top', weight='bold', color='white', fontsize=14)\n\n# plot the results using an horizontal bar plot\nnumber_of_seasons.plot(kind='bar', color='green')\n\n# modify the ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# define the title and the labels\nplt.title('Total number of seasons in the first division', fontsize=20)\nplt.xlabel('Club', fontsize=16)\nplt.ylabel('Number of seasons', fontsize=16)\n\n\n# In the last season (2019\u201320), all the teams from the image above played in the Bundesliga with the exception of Hamburger, Stuttgart, and Kaiserslautern.\n\n# # Number of teams in the Bundesliga per\u00a0season\n# The Bundesliga began with 16 teams in 1963 and it was enlarged to 18 teams in 1965. Since then, the number of clubs in the Bundesliga has remained unchanged with the exception of the season 1991\u201392. In that season, the league was temporarily expanded (20 teams) to accommodate the clubs from former East Germany.\n\n#%%\n\n# number of teams in the Bundesliga per season\nnumber_clubs = df_scores.groupby('season').count().club", "target_code": "number_clubs.plot(linewidth=3, color='g')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Analysis of the German league (Bundesliga)\n# This article is a journey through the history of the Bundesliga. Analyzing historical data (all classifications from 1963 until 2020), we will be able to answer many questions about the German league. What teams won the German league? What teams nearly won the Bundesliga? When did Bayern's hegemony start? What teams receive more penalties?\u00a0\u2026 and many more! Continue reading\n\n# # Introduction\n# Let's make a brief introduction for those that have never heard about the German league. \ud83d\ude4c\n#\n# The German football league commonly known as the Bundesliga is the first national football league in Germany, being one of the most popular professional sports leagues across the world. It was founded in 1963 after the unification of five regional leagues from West Germany and consisted initially of 16 teams.\n#\n# At the end of a match, the winning team is rewarded with three points (before the season 1995\u201396 with 2 points) and the losing team with zero. In case of a tie, both teams are rewarded with 1 point.\n#\n# In many European leagues, the bottom three teams are automatically relegated to the second division. On the contrary, in the Bundesliga, only the bottom two are directly relegated to the 2 Bundesliga. The 16th team in the Bundesliga and the 3th in the 2 Bundesliga contest a two-legged play-off for a place in the first division.\n#\n# The introduction is made! Now, we are ready to analyze the data \u2764\ufe0f\n\n# # Web data extraction\n# The historical data of the Bundesliga (from 1963 until 2020) was scraped from https://www.bdfutbol.com/. 
This website contains football rankings of the best European leagues.\n#\n# To scrape the data, we have used BeautifulSoup which is a popular Python library for extracting information from an HTML page. After obtaining all the data, we have stored it in a Pandas data frame for further processing.\n\n\nimport chart_studio\nimport plotly.graph_objects as go\nimport chart_studio.plotly as py\nimport plotly\nimport time\n\nimport requests\nimport bs4\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nplt.style.use('seaborn')\n\n# the web page bdfutbol contains historical data about multiple european leagues - we have selected the german league\nweb_page = 'https://www.bdfutbol.com/es/t/'\nleague = 't.html#ger'\n\n# obtain the html code as a string\nresponse = requests.get(web_page + league)\nhtml = response.text\n\n# create a BeautifulSoup object\nsoup = bs4.BeautifulSoup(html, \"html.parser\")\ndf_scores = pd.DataFrame()\n\n# loop through the anchor tags\nfor anchor in soup.find_all(class_=\"bloc_temporades\")[4].find_all('a'):\n\n # get the hypertext reference and the text of the anchor tag\n page = anchor.get('href')\n season = anchor.text\n\n # obtain the html code as a string\n response = requests.get(web_page + page)\n html = response.text\n\n # create a BeautifulSoup object\n soup = bs4.BeautifulSoup(html, \"html.parser\")\n\n # obtain the table containing the annual classification\n table = soup.find(class_=\"taula_estil sortable\")\n df_league = pd.read_html(str(table))[0]\n\n # add the season - anchor text and append the data frame to df_scores\n df_league['season'] = season\n df_scores = df_scores.append(df_league)\n\n time.sleep(0.2)\n\n# visualize the first five rows of the scraped data frame\ndf_scores.head()\n\n\n# # Data Cleaning\n# Data Cleaning is the process of transforming raw data into a standardized form that can easily be analyzed with data analytics tools. In this particular case, before analyzing the data using Pandas, we perform a few cleaning operations. First, we remove unnecessary columns and rename the remaining ones using English terms (remember that the data was scraped from a Spanish website). Then, we modify the wrong data types. The column points (points obtained by a team during a particular season) is of data type object instead of integer due to the presence of asterisks. These asterisks are used to refer to explanations at the bottom of the web page and they are not relevant for this analysis. In fact, the data type is not imported correctly because of the existence of these asterisks in some entries of the column. 
Therefore, we have to remove them, before converting the column points to an integer data type.\n\n# ### Drop unnecessary columns\n\n\n# drop the columns Unnamed:0 and Unnamed:2 - they do not contain valuable information\ndf_scores.drop(columns=['Unnamed: 0', 'Unnamed: 2'], inplace=True)\n\n\n# check that the modification has been carried out properly\ndf_scores.columns\n\n\n# ### Rename the columns using English terms\n\n\n# rename the columns\ndf_scores.rename({'Unnamed: 1': 'position', 'Unnamed: 3': 'club', 'Puntos': 'points', 'PJ': 'played', 'PG': 'won',\n 'PE': 'drawn', 'PP': 'lost', 'GF': 'goals_for', 'GC': 'goals_against', 'TA': 'yellow_card',\n 'TR': 'red_card'}, axis=1, inplace=True)\n\n\n# check that the modification has been carried out properly\ndf_scores.columns\n\n\n# ### Modify incorrect data types\n\n\n# in some cases the entries in the column (points) contain additional symbols\ndf_scores.points = df_scores.points.astype(str)\n\n# remove the symbols extracting only the digits\ndf_scores.points = df_scores.points.str.extract('(\\d+)')\n\n\n# check that the modification has been carried out properly\ndf_scores.points.unique()\n\n\n# convert columns (points) into int data type\ndf_scores.points = df_scores.points.astype(int)\n\n\n# After cleaning the data, we obtain a Pandas data frame that can be easily processed to extract conclusions. As shown below, the data frame contains information such as the number of games won, drawn, and lost, the number of yellow and red cards, the number of points, and the position in the ranking of all teams that took part in the Bundesliga from 1963 until 2020.\n\n\ndf_scores.head()\n\n\n# # German league\u00a0winners\n# The Bundesliga has been played by 57 different clubs during its 57 years of existence (up to the season 2019\u201320); however, only twelve of them got their hands on the trophy. The following plot shows the German league winners from season 1963\u201364 until 2019\u201320.\n\n\n# teams that won the Bundesliga\nfirst_position = df_scores[df_scores['position'] == 1].club.value_counts()\n\n# plot labels indicating the number of leagues\nfor i, value in enumerate(first_position):\n plt.text(value, i, str(value), horizontalalignment='right',\n verticalalignment='center', weight='bold', color='white', fontsize=14)\n\n# plot the results using an horizontal bar plot\nfirst_position.plot(kind='barh')\n\n# ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# labels and title\nplt.xlabel('Number of leagues', fontsize=14)\nplt.ylabel('Football teams', fontsize=14)\nplt.title('Bundesliga winners', fontsize=20)\n\n\n# As shown above, Bayern M\u00fcnchen is the most successful club in the history of the Bundesliga with 29 titles, which represents more than 50% of the leagues. The next most successful teams are Borussia M\u00f6nchengladbach and Borussia Dortmund which has won the Bundesliga five times. 
Apart from them, other teams such as Werder Bremen, Hamburger, Stuttgart, K\u00f6ln, and Kaiserslautern also had the honor of lifting the Bundesliga trophy multiple times.\n\n\n# number of clubs that played in the Bundesliga at least one season\ndf_scores.club.nunique()\n\n\n# number of clubs that won the Bundesliga\ndf_scores[df_scores['position'] == 1].club.nunique()\n\n\n# number of seasons\ndf_scores.season.nunique()\n\n\n# # German league runner-ups\n# There are 6 football teams that have never won the league but they were on one or more occasions runner-ups: Alemannia Aachen, Bayer Leverkusen, Hertha Berliner, Meidericher, RB Leipzig, and Schalke 04. As shown below, Schalke 04 and Bayer Leverkusen have been particularly unlucky being runner-ups of the Bundesliga 7 and 5 times respectively. Additionally, we can also observe that Bayern Munich is the club that has been on more occasions runner-up of the Bundesliga (10 times).\n\n\n# Bundesliga runner-ups\nsecond_position = df_scores[df_scores['position'] == 2].club.value_counts()\n\n# plot labels indicating the number times the team was runner-up\nfor i, value in enumerate(second_position):\n plt.text(value, i, str(value), horizontalalignment='right',\n verticalalignment='center', weight='bold', color='white', fontsize=12)\n\n# plot the results using an horizontal bar plot\nsecond_position.plot(kind='barh', color='maroon')\n\n# ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# labels and title\nplt.xlabel('Number of times', fontsize=14)\nplt.ylabel('Football teams', fontsize=14)\nplt.title('Bundesliga runner-ups', fontsize=20)\n\n\n# In the season 2016\u201317, RB Leipzig finished second in the Bundesliga. The club was just founded in 2009 and is currently one of the leading teams in Germany mainly because of the significant investments made by the company Red Bull.\n\n\n# football teams that have never won the league but they were on one or more occasions runner-ups\nfirst_position = set(df_scores[df_scores['position'] == 1].club.unique())\n\nsecond_position = set(df_scores[df_scores['position'] == 2].club.unique())\n\nsecond_position.difference(first_position)\n\n\n# # Total number of seasons in the first\u00a0division (Top 10)\n# Werder Bremen holds the record for having played the most seasons in the German league. They have played the Bundesliga in 56 of its 57 seasons, being relegated to the second division only on one occasion. Bayern M\u00fcnchen has played in the Bundesliga uninterrupted since 1965, and Hamburger from 1963 until 2018, both of them 55 seasons in total. 
Apart from the aforementioned clubs, Borussia Dortmund, Stuttgart, Borussia M\u00f6nchengladbach, Schalke 04, and Eintracht Frankfurt have also participated in the German league more than 50 seasons.\n\n\n# total number of seasons in the first division (top 10)\nnumber_of_seasons = df_scores.groupby('club').count(\n).position.sort_values(ascending=False).head(10)\n\n# plot labels indicating the number of seasons in the first division\nfor i, value in enumerate(number_of_seasons):\n plt.text(i, value, str(value), horizontalalignment='center',\n verticalalignment='top', weight='bold', color='white', fontsize=14)\n\n# plot the results using an horizontal bar plot\nnumber_of_seasons.plot(kind='bar', color='green')\n\n# modify the ticks\nplt.xticks(fontsize=14)\nplt.yticks(fontsize=14)\n\n# define the title and the labels\nplt.title('Total number of seasons in the first division', fontsize=20)\nplt.xlabel('Club', fontsize=16)\nplt.ylabel('Number of seasons', fontsize=16)\n\n\n# In the last season (2019\u201320), all the teams from the image above played in the Bundesliga with the exception of Hamburger, Stuttgart, and Kaiserslautern.\n\n# # Number of teams in the Bundesliga per\u00a0season\n# The Bundesliga began with 16 teams in 1963 and it was enlarged to 18 teams in 1965. Since then, the number of clubs in the Bundesliga has remained unchanged with the exception of the season 1991\u201392. In that season, the league was temporarily expanded (20 teams) to accommodate the clubs from former East Germany.\n\n\n# number of teams in the Bundesliga per season\nnumber_clubs = df_scores.groupby('season').count().club\n", "project_metadata": {"full_name": "amandaiglesiasmoreno/bundesliga", "description": null, "topics": [], "git_url": "git://github.com/amandaiglesiasmoreno/bundesliga.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2020-08-25T19:08:51Z", "size": 1175, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 938497}, "last_updated": "2020-11-14T11:33:25Z"}, "intent": "# plot the results using a line plot"}, {"original_comment": "# normalize the final confusion matrix\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n#!/usr/bin/env python\n# coding: utf-8\nfrom keras.optimizers import Adam, Nadam\nfrom keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, Activation\nimport sys\nimport matplotlib.ticker as plticker\nimport tensorflow as tf\nfrom tensorflow.compat.v1 import InteractiveSession\nfrom tensorflow.compat.v1 import ConfigProto\nfrom keras.backend.tensorflow_backend import set_session\nfrom keras import regularizers\nfrom sklearn import preprocessing\nfrom numpy import linalg as la\nfrom sklearn.metrics import confusion_matrix\nimport itertools\nfrom keras.models import Sequential\nimport keras\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.callbacks import Callback\nfrom scipy.io import loadmat\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nconfig = ConfigProto()\nconfig.gpu_options.per_process_gpu_memory_fraction = 0.4\nconfig.gpu_options.allow_growth = True\nsession = InteractiveSession(config=config)\n\n\n# ### Confusion Matrix Function\n\n#%%\n\ndef plot_cm(cm, n):\n if n != 72:\n fig = plt.figure(figsize=(6.5, 6.5))\n ax = fig.add_subplot()\n plt.imshow(cm, cmap='Blues', interpolation='nearest')\n plt.colorbar()\n\n loc = plticker.MultipleLocator(base=1.0)\n ax.xaxis.set_major_locator(loc)\n ax.yaxis.set_major_locator(loc)\n\n if n == 6:\n 
ax.set_xticklabels([''] + [\"HB\", \"HF\", \"AB\", \"AF\", \"FV\", \"IO\"])\n ax.set_yticklabels([''] + [\"HB\", \"HF\", \"AB\", \"AF\", \"FV\", \"IO\"])\n elif n == 12:\n ax.set_xticklabels(\n [''] + [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\", \"11\", \"12\"])\n ax.set_yticklabels(\n [''] + [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\", \"11\", \"12\"])\n elif n == 2:\n ax.set_xticklabels([''] + [\"HF\", \"IO\"])\n ax.set_yticklabels([''] + [\"HF\", \"IO\"])\n\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, cm[i, j], horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n else:\n fig = plt.figure(figsize=(12, 12))\n ax = fig.add_subplot()\n plt.imshow(cm, cmap='Blues', interpolation='nearest')\n plt.colorbar()\n\n plt.title(\"Confusion matrix\")\n plt.ylabel('True')\n plt.xlabel('Predicted')\n\n\n# ### Simple CNN\n#\n# \u03a4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 \u03b1\u03c0\u03bb\u03cc cnn \u03b3\u03b9\u03b1 \u03cc\u03bb\u03b5\u03c2 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2. \u03a4\u03bf \u03ba\u03ac\u03b8\u03b5 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03bf \u03c4\u03bf \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03be\u03b5\u03c7\u03c9\u03c1\u03b9\u03c3\u03c4\u03ac \u03b3\u03b9\u03b1 \u03c4\u03bf\u03bd \u03ba\u03ac\u03b8\u03b5 \u03c3\u03c5\u03bc\u03bc\u03b5\u03c4\u03ad\u03c7\u03bf\u03bd\u03c4\u03b1 \u03ba\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1 \u03c0\u03b1\u03af\u03c1\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b1 \u03c4\u03b5\u03bb\u03b9\u03ba\u03ac \u03b1\u03c0\u03bf\u03c4\u03b5\u03bb\u03ad\u03c3\u03bc\u03b1\u03c4\u03b1. \u0394\u03b7\u03bb\u03b1\u03b4\u03ae, \u03c0\u03b1\u03af\u03c1\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03bc\u03ad\u03c3\u03b7 \u03c4\u03b9\u03bc\u03ae \u03c4\u03b7\u03c2 \u03b1\u03ba\u03c1\u03af\u03b2\u03b5\u03b9\u03b1\u03c2 \u03b3\u03b9\u03b1 \u03ba\u03ac\u03b8\u03b5 \u03c3\u03c5\u03bc\u03bc\u03b5\u03c4\u03ad\u03c7\u03bf\u03bd\u03c4\u03b1, \u03b5\u03bd\u03ce \u03b1\u03b8\u03c1\u03bf\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03cc\u03bb\u03bf\u03c5\u03c2 \u03c4\u03bf\u03c5\u03c2 confusion matrices \u03ba\u03b1\u03b9 \u03cd\u03c3\u03c4\u03b5\u03c1\u03b1 \u03c4\u03bf\u03c5\u03c2 \u03ba\u03b1\u03bd\u03bf\u03bd\u03b9\u03ba\u03bf\u03c0\u03bf\u03b9\u03bf\u03cd\u03bc\u03b5. 
\u0397 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1 \u03b1\u03c5\u03c4\u03ae \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03b5\u03c4\u03b1\u03b9 \u03c3\u03b5 \u03cc\u03bb\u03b1 \u03c4\u03b1 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03b1 \u03bd\u03b5\u03c5\u03c1\u03c9\u03bd\u03b9\u03ba\u03ce\u03bd \u03b4\u03b9\u03ba\u03c4\u03cd\u03c9\u03bd \u03c0\u03bf\u03c5 \u03b4\u03bf\u03ba\u03b9\u03bc\u03ac\u03c3\u03b1\u03bc\u03b5.\n\n#%%\n\ncm_cv1 = np.zeros((6, 6)) # simple cnn\ncm_cv2 = np.zeros((6, 6)) # pca and simple cnn\ncm_cv3 = np.zeros((6, 6)) # pca, deep cnn\ncm_cv4 = np.zeros((2, 2)) # binary, pca, dfnn\ncm_cv5 = np.zeros((2, 2)) # binary, simple cnn\ncm_cv6 = np.zeros((12, 12)) # hf, exemplar, simple cnn\ncm_cv7 = np.zeros((72, 72)) # deep cnn\n\nfiles = [\"data/S1.mat\", \"data/S2.mat\", \"data/S3.mat\", \"data/S4.mat\", \"data/S5.mat\", \"data/S6.mat\",\n \"data/S7.mat\", \"data/S8.mat\", \"data/S9.mat\", \"data/S10.mat\"]\nN = 32\nelectrodes = 124\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n\n X = X_2D\n y = np.array(mat[\"categoryLabels\"]).ravel() # get labels\n\n # create train and test sets\n X_training = X[:int(0.8*len(X))]\n X_validation = X[int(0.8*len(X)):]\n\n y_training = y[:int(0.8*len(X))]\n y_validation = y[int(0.8*len(X)):]\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n # reshape to treat the data like images (124x32)\n X_training = np.reshape(X_training, (-1, electrodes, N, 1))\n X_validation = np.reshape(X_validation, (-1, electrodes, N, 1))\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-7),\n activity_regularizer=regularizers.l2(1e-7),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(num_classes, activity_regularizer=regularizers.l2(\n 1e-6), activation=\"softmax\"))\n\n model.compile(loss='categorical_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the comfusion matrix\n cnf_matrix1 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv1 += cnf_matrix1\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv1, 1)\nfor i in range(0, 6):\n cm_cv1[i, :] = cm_cv1[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv1, 4), 6)\n\n#%%\n\ncm_cv1\n\n\n# ### PCA with Simple CNN\n#\n# \u0395\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03af\u03b4\u03b9\u03bf cnn \u03bc\u03b5 \u03c0\u03c1\u03b9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, \u03c9\u03c3\u03c4\u03cc\u03c3\u03bf \u03c4\u03ce\u03c1\u03b1 
\u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03ba\u03b1\u03b9 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, \u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180, \u03ba\u03b1\u03b8\u03ce\u03c2 \u03b1\u03c0\u03cc \u03c4\u03b7 \u03c3\u03c5\u03bd\u03ac\u03c1\u03c4\u03b7\u03c3\u03b7 lda \u03c0\u03bf\u03c5 \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c6\u03c4\u03b9\u03ac\u03be\u03b5\u03b9, \u03c4\u03bf \u03b2\u03ad\u03bb\u03c4\u03b9\u03c3\u03c4\u03bf k \u03ad\u03b2\u03b3\u03b1\u03b9\u03bd\u03b5 \u03c0\u03bf\u03bb\u03bb\u03ad\u03c2 \u03c6\u03bf\u03c1\u03ad\u03c2 \u03b3\u03cd\u03c1\u03c9 \u03c3\u03c4\u03bf 180.\n\n#%%\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n categoryLabels = np.array(mat[\"categoryLabels\"]) # get labels\n\n k = 180\n X = X_2D.copy()\n y = categoryLabels.ravel()\n X -= np.mean(X, axis=0)\n\n # pca with svd, using an optimum k\n [u, s, v] = la.svd(X)\n v = v.transpose()\n v_new = v[:, :k]\n X_pca = np.dot(X, v_new)\n\n # create train and test sets\n X_training = X_pca[:int(0.8*len(X_pca))]\n X_validation = X_pca[int(0.8*len(X_pca)):]\n\n y_training = y[:int(0.8*len(X_pca))]\n y_validation = y[int(0.8*len(X_pca)):]\n\n X_training = np.reshape(X_training, (-1, 30, 6, 1))\n X_validation = np.reshape(X_validation, (-1, 30, 6, 1))\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(units=128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-3),\n activity_regularizer=regularizers.l2(1e-3),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(units=num_classes, activity_regularizer=regularizers.l2(\n 1e-5), activation=\"softmax\"))\n\n model.compile(loss='categorical_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the confusion matrix\n cnf_matrix2 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv2 += cnf_matrix2\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv2, 1)\nfor i in range(0, 6):\n cm_cv2[i, :] = cm_cv2[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv2, 4), 6)\n\n#%%\n\ncm_cv2\n\n\n# ### PCA with Deep CNN\n#\n# \u03a6\u03c4\u03b9\u03ac\u03c7\u03bd\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 Deep CNN \u03ba\u03b1\u03b9 \u03c4\u03bf \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c0\u03ac\u03bb\u03b9 \u03b3\u03b9\u03b1 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, \u03b5\u03bd\u03ce \u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, 
\u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180\n\n#%%\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n categoryLabels = np.array(mat[\"categoryLabels\"]) # get labels\n\n k = 180\n X = X_2D.copy()\n y = categoryLabels.ravel()\n X -= np.mean(X, axis=0)\n\n # pca with svd, using an optimum k\n [u, s, v] = la.svd(X)\n v = v.transpose()\n v_new = v[:, :k]\n X_pca = np.dot(X, v_new)\n\n X_training = X[:int(0.8*len(X))]\n X_validation = X[int(0.8*len(X)):]\n\n y_training = y[:int(0.8*len(X))]\n y_validation = y[int(0.8*len(X)):]\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # We subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n X_training = np.reshape(X_training, (-1, electrodes, N, 1))\n X_validation = np.reshape(X_validation, (-1, electrodes, N, 1))\n\n model = Sequential()\n\n model.add(Conv2D(8, kernel_size=3, input_shape=(\n 124, 32, 1), activation='relu'))\n model.add(Conv2D(8, kernel_size=3, activation='relu'))\n model.add(MaxPool2D(2, 2))\n model.add(BatchNormalization())\n\n model.add(Conv2D(16, kernel_size=2, activation='relu'))\n model.add(Conv2D(16, kernel_size=2, activation='relu'))\n model.add(MaxPool2D(2, 2))\n model.add(BatchNormalization())\n\n model.add(Conv2D(64, kernel_size=3, activation='relu'))\n model.add(BatchNormalization())\n\n model.add(Flatten())\n model.add(Dense(32, activation=\"relu\"))\n model.add(Dense(16, activation=\"relu\"))\n model.add(Dense(6, activation=\"softmax\"))\n\n optimizer = Nadam(lr=0.004)\n model.compile(optimizer=optimizer,\n loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the confusion matrix\n cnf_matrix3 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv3 += cnf_matrix3\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv3, 1)\nfor i in range(0, 6):\n cm_cv3[i, :] = cm_cv3[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv3, 4), 6)\n\n#%%\n\ncm_cv3\n\n\n# ### PCA with DFNN, 2 classes\n#\n# \u0391\u03c1\u03c7\u03b9\u03ba\u03ac \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, \u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180. 
\u03a3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1, \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03bf DFNN \u03c3\u03c4\u03b9\u03c2 \u03b4\u03cd\u03bf \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, human face \u03ba\u03b1\u03b9 inanimate object.\n\n#%%\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X = np.array(mat[\"X_2D\"])\n y = np.array(mat[\"exemplarLabels\"]).ravel() # get labels\n\n y_bin = []\n X_bin = []\n\n for i in range(0, len(X)): # keep only the hf and the io category\n if (12 < y[i] < 25):\n y_bin.append(0) # set class hf as 0\n X_bin.append(X[i])\n elif (60 < y[i]): # set class io as 1\n y_bin.append(1)\n X_bin.append(X[i])\n\n X_bin = np.array(X_bin)\n y_bin = np.array(y_bin).ravel()\n\n # pca with svd using an optimum k\n [u, s, v] = la.svd(X_bin)\n v = v.transpose()\n v_new = v[:, :180]\n X_bin = np.dot(X_bin, v_new)\n\n X_training = X_bin[:int(0.8*len(X_bin))] # create train and test sets\n X_validation = X_bin[int(0.8*len(X_bin)):]\n\n y_training = y_bin[:int(0.8*len(X_bin))]\n y_validation = y_bin[int(0.8*len(X_bin)):]\n\n num_classes = 2 # we have only 2 classes, hf and io\n\n # dfnn model\n model = Sequential()\n model.add(Dense(160, input_dim=180, activation='relu'))\n model.add(Dense(140, activation='tanh',\n activity_regularizer=regularizers.l1(1e-3)))\n model.add(Dense(120, activation='tanh',\n kernel_regularizer=regularizers.l2(1e-5)))\n model.add(Dense(64, input_dim=8, activation='tanh'))\n model.add(Dense(32, activation='relu',\n activity_regularizer=regularizers.l2(1e-7)))\n model.add(Dense(1, activation='sigmoid'))\n\n model.compile(loss='binary_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation),\n shuffle=True)\n\n _, accuracy = model.evaluate(X_validation, y_validation)\n print('Accuracy: %.2f' % (accuracy*100))\n\n y_validation_predictions = np.round(model.predict(\n X_validation, verbose=1)) # make predictions\n\n # create the confusion matrix\n cnf_matrix4 = confusion_matrix(y_validation, y_validation_predictions)\n # add together all the confusion matrices\n cm_cv4 += cnf_matrix4\n\n _, accuracy = model.evaluate(\n X_validation, y_validation) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv4, 1)\nfor i in range(0, 2):\n cm_cv4[i, :] = cm_cv4[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv4, 4), 2)\n\n#%%\n\ncm_cv4\n\n\n# ### Simple CNN, 2 classes\n#\n# \u0395\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03b1\u03c0\u03bb\u03cc cnn \u03c3\u03c4\u03b9\u03c2 \u03b4\u03cd\u03bf \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, human face \u03ba\u03b1\u03b9 inanimate object.\n\n#%%\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X = np.array(mat[\"X_2D\"])\n y = np.array(mat[\"exemplarLabels\"]).ravel() # get labels\n\n y_bin = []\n X_bin = []\n\n for i in range(0, len(X)): # keep only the hf and the io category\n if (12 < y[i] < 25):\n y_bin.append(0) # set class hf as 0\n X_bin.append(X[i])\n elif (60 < y[i]): # set class io as 0\n y_bin.append(1)\n X_bin.append(X[i])\n\n X_bin = np.array(X_bin)\n y_bin = 
np.array(y_bin).ravel()\n\n X_training = X_bin[:int(0.8*len(X_bin))] # create train and test sets\n X_validation = X_bin[int(0.8*len(X_bin)):]\n\n y_training = y_bin[:int(0.8*len(X_bin))]\n y_validation = y_bin[int(0.8*len(X_bin)):]\n\n # reshape to treat the data like images (124x32)\n X_training = np.reshape(X_training, (-1, 124, 32, 1))\n X_validation = np.reshape(X_validation, (-1, 124, 32, 1))\n\n num_classes = 2 # we have only 2 classes, hf and io\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(units=128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-3),\n activity_regularizer=regularizers.l2(1e-3),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(units=num_classes-1,\n activity_regularizer=regularizers.l2(1e-5), activation=\"sigmoid\"))\n\n model.compile(loss=keras.losses.BinaryCrossentropy(),\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation),\n shuffle=True)\n\n y_validation_predictions = np.round(model.predict(\n X_validation, verbose=1)) # make predictions\n\n # create the confusion matrix\n cnf_matrix5 = confusion_matrix(y_validation, y_validation_predictions)\n # add together all the confusion matrices\n cm_cv5 += cnf_matrix5\n\n _, accuracy = model.evaluate(\n X_validation, y_validation) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy", "target_code": "sum_by_row = np.sum(cm_cv5, 1)\nfor i in range(0, 2):\n cm_cv5[i, :] = cm_cv5[i, :] / sum_by_row[i]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n#!/usr/bin/env python\n# coding: utf-8\nfrom keras.optimizers import Adam, Nadam\nfrom keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, Activation\nimport sys\nimport matplotlib.ticker as plticker\nimport tensorflow as tf\nfrom tensorflow.compat.v1 import InteractiveSession\nfrom tensorflow.compat.v1 import ConfigProto\nfrom keras.backend.tensorflow_backend import set_session\nfrom keras import regularizers\nfrom sklearn import preprocessing\nfrom numpy import linalg as la\nfrom sklearn.metrics import confusion_matrix\nimport itertools\nfrom keras.models import Sequential\nimport keras\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.callbacks import Callback\nfrom scipy.io import loadmat\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nconfig = ConfigProto()\nconfig.gpu_options.per_process_gpu_memory_fraction = 0.4\nconfig.gpu_options.allow_growth = True\nsession = InteractiveSession(config=config)\n\n\n# ### Confusion Matrix Function\n\n\ndef plot_cm(cm, n):\n if n != 72:\n fig = plt.figure(figsize=(6.5, 6.5))\n ax = fig.add_subplot()\n plt.imshow(cm, cmap='Blues', interpolation='nearest')\n plt.colorbar()\n\n loc = plticker.MultipleLocator(base=1.0)\n ax.xaxis.set_major_locator(loc)\n ax.yaxis.set_major_locator(loc)\n\n if n == 6:\n ax.set_xticklabels([''] + [\"HB\", \"HF\", \"AB\", \"AF\", \"FV\", \"IO\"])\n ax.set_yticklabels([''] + [\"HB\", \"HF\", \"AB\", \"AF\", \"FV\", \"IO\"])\n elif n == 12:\n ax.set_xticklabels(\n [''] + [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\", \"11\", \"12\"])\n ax.set_yticklabels(\n [''] + [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", 
\"9\", \"10\", \"11\", \"12\"])\n elif n == 2:\n ax.set_xticklabels([''] + [\"HF\", \"IO\"])\n ax.set_yticklabels([''] + [\"HF\", \"IO\"])\n\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, cm[i, j], horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n else:\n fig = plt.figure(figsize=(12, 12))\n ax = fig.add_subplot()\n plt.imshow(cm, cmap='Blues', interpolation='nearest')\n plt.colorbar()\n\n plt.title(\"Confusion matrix\")\n plt.ylabel('True')\n plt.xlabel('Predicted')\n\n\n# ### Simple CNN\n#\n# \u03a4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 \u03b1\u03c0\u03bb\u03cc cnn \u03b3\u03b9\u03b1 \u03cc\u03bb\u03b5\u03c2 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2. \u03a4\u03bf \u03ba\u03ac\u03b8\u03b5 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03bf \u03c4\u03bf \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03be\u03b5\u03c7\u03c9\u03c1\u03b9\u03c3\u03c4\u03ac \u03b3\u03b9\u03b1 \u03c4\u03bf\u03bd \u03ba\u03ac\u03b8\u03b5 \u03c3\u03c5\u03bc\u03bc\u03b5\u03c4\u03ad\u03c7\u03bf\u03bd\u03c4\u03b1 \u03ba\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1 \u03c0\u03b1\u03af\u03c1\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b1 \u03c4\u03b5\u03bb\u03b9\u03ba\u03ac \u03b1\u03c0\u03bf\u03c4\u03b5\u03bb\u03ad\u03c3\u03bc\u03b1\u03c4\u03b1. \u0394\u03b7\u03bb\u03b1\u03b4\u03ae, \u03c0\u03b1\u03af\u03c1\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03bc\u03ad\u03c3\u03b7 \u03c4\u03b9\u03bc\u03ae \u03c4\u03b7\u03c2 \u03b1\u03ba\u03c1\u03af\u03b2\u03b5\u03b9\u03b1\u03c2 \u03b3\u03b9\u03b1 \u03ba\u03ac\u03b8\u03b5 \u03c3\u03c5\u03bc\u03bc\u03b5\u03c4\u03ad\u03c7\u03bf\u03bd\u03c4\u03b1, \u03b5\u03bd\u03ce \u03b1\u03b8\u03c1\u03bf\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03cc\u03bb\u03bf\u03c5\u03c2 \u03c4\u03bf\u03c5\u03c2 confusion matrices \u03ba\u03b1\u03b9 \u03cd\u03c3\u03c4\u03b5\u03c1\u03b1 \u03c4\u03bf\u03c5\u03c2 \u03ba\u03b1\u03bd\u03bf\u03bd\u03b9\u03ba\u03bf\u03c0\u03bf\u03b9\u03bf\u03cd\u03bc\u03b5. 
\u0397 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1 \u03b1\u03c5\u03c4\u03ae \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03b5\u03c4\u03b1\u03b9 \u03c3\u03b5 \u03cc\u03bb\u03b1 \u03c4\u03b1 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03b1 \u03bd\u03b5\u03c5\u03c1\u03c9\u03bd\u03b9\u03ba\u03ce\u03bd \u03b4\u03b9\u03ba\u03c4\u03cd\u03c9\u03bd \u03c0\u03bf\u03c5 \u03b4\u03bf\u03ba\u03b9\u03bc\u03ac\u03c3\u03b1\u03bc\u03b5.\n\n\ncm_cv1 = np.zeros((6, 6)) # simple cnn\ncm_cv2 = np.zeros((6, 6)) # pca and simple cnn\ncm_cv3 = np.zeros((6, 6)) # pca, deep cnn\ncm_cv4 = np.zeros((2, 2)) # binary, pca, dfnn\ncm_cv5 = np.zeros((2, 2)) # binary, simple cnn\ncm_cv6 = np.zeros((12, 12)) # hf, exemplar, simple cnn\ncm_cv7 = np.zeros((72, 72)) # deep cnn\n\nfiles = [\"data/S1.mat\", \"data/S2.mat\", \"data/S3.mat\", \"data/S4.mat\", \"data/S5.mat\", \"data/S6.mat\",\n \"data/S7.mat\", \"data/S8.mat\", \"data/S9.mat\", \"data/S10.mat\"]\nN = 32\nelectrodes = 124\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n\n X = X_2D\n y = np.array(mat[\"categoryLabels\"]).ravel() # get labels\n\n # create train and test sets\n X_training = X[:int(0.8*len(X))]\n X_validation = X[int(0.8*len(X)):]\n\n y_training = y[:int(0.8*len(X))]\n y_validation = y[int(0.8*len(X)):]\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n # reshape to treat the data like images (124x32)\n X_training = np.reshape(X_training, (-1, electrodes, N, 1))\n X_validation = np.reshape(X_validation, (-1, electrodes, N, 1))\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-7),\n activity_regularizer=regularizers.l2(1e-7),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(num_classes, activity_regularizer=regularizers.l2(\n 1e-6), activation=\"softmax\"))\n\n model.compile(loss='categorical_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the comfusion matrix\n cnf_matrix1 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv1 += cnf_matrix1\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv1, 1)\nfor i in range(0, 6):\n cm_cv1[i, :] = cm_cv1[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv1, 4), 6)\n\n\ncm_cv1\n\n\n# ### PCA with Simple CNN\n#\n# \u0395\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03af\u03b4\u03b9\u03bf cnn \u03bc\u03b5 \u03c0\u03c1\u03b9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, \u03c9\u03c3\u03c4\u03cc\u03c3\u03bf \u03c4\u03ce\u03c1\u03b1 
\u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03ba\u03b1\u03b9 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, \u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180, \u03ba\u03b1\u03b8\u03ce\u03c2 \u03b1\u03c0\u03cc \u03c4\u03b7 \u03c3\u03c5\u03bd\u03ac\u03c1\u03c4\u03b7\u03c3\u03b7 lda \u03c0\u03bf\u03c5 \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c6\u03c4\u03b9\u03ac\u03be\u03b5\u03b9, \u03c4\u03bf \u03b2\u03ad\u03bb\u03c4\u03b9\u03c3\u03c4\u03bf k \u03ad\u03b2\u03b3\u03b1\u03b9\u03bd\u03b5 \u03c0\u03bf\u03bb\u03bb\u03ad\u03c2 \u03c6\u03bf\u03c1\u03ad\u03c2 \u03b3\u03cd\u03c1\u03c9 \u03c3\u03c4\u03bf 180.\n\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n categoryLabels = np.array(mat[\"categoryLabels\"]) # get labels\n\n k = 180\n X = X_2D.copy()\n y = categoryLabels.ravel()\n X -= np.mean(X, axis=0)\n\n # pca with svd, using an optimum k\n [u, s, v] = la.svd(X)\n v = v.transpose()\n v_new = v[:, :k]\n X_pca = np.dot(X, v_new)\n\n # create train and test sets\n X_training = X_pca[:int(0.8*len(X_pca))]\n X_validation = X_pca[int(0.8*len(X_pca)):]\n\n y_training = y[:int(0.8*len(X_pca))]\n y_validation = y[int(0.8*len(X_pca)):]\n\n X_training = np.reshape(X_training, (-1, 30, 6, 1))\n X_validation = np.reshape(X_validation, (-1, 30, 6, 1))\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(units=128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-3),\n activity_regularizer=regularizers.l2(1e-3),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(units=num_classes, activity_regularizer=regularizers.l2(\n 1e-5), activation=\"softmax\"))\n\n model.compile(loss='categorical_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the confusion matrix\n cnf_matrix2 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv2 += cnf_matrix2\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv2, 1)\nfor i in range(0, 6):\n cm_cv2[i, :] = cm_cv2[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv2, 4), 6)\n\n\ncm_cv2\n\n\n# ### PCA with Deep CNN\n#\n# \u03a6\u03c4\u03b9\u03ac\u03c7\u03bd\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 Deep CNN \u03ba\u03b1\u03b9 \u03c4\u03bf \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c0\u03ac\u03bb\u03b9 \u03b3\u03b9\u03b1 \u03c4\u03b9\u03c2 6 \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, \u03b5\u03bd\u03ce \u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, 
\u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180\n\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X_2D = np.array(mat[\"X_2D\"])\n categoryLabels = np.array(mat[\"categoryLabels\"]) # get labels\n\n k = 180\n X = X_2D.copy()\n y = categoryLabels.ravel()\n X -= np.mean(X, axis=0)\n\n # pca with svd, using an optimum k\n [u, s, v] = la.svd(X)\n v = v.transpose()\n v_new = v[:, :k]\n X_pca = np.dot(X, v_new)\n\n X_training = X[:int(0.8*len(X))]\n X_validation = X[int(0.8*len(X)):]\n\n y_training = y[:int(0.8*len(X))]\n y_validation = y[int(0.8*len(X)):]\n\n num_classes = 6\n y_training1hot = keras.utils.to_categorical(\n y_training - 1, num_classes) # We subtract 1 to convert to 0-index\n y_validation1hot = keras.utils.to_categorical(\n y_validation - 1, num_classes)\n\n X_training = np.reshape(X_training, (-1, electrodes, N, 1))\n X_validation = np.reshape(X_validation, (-1, electrodes, N, 1))\n\n model = Sequential()\n\n model.add(Conv2D(8, kernel_size=3, input_shape=(\n 124, 32, 1), activation='relu'))\n model.add(Conv2D(8, kernel_size=3, activation='relu'))\n model.add(MaxPool2D(2, 2))\n model.add(BatchNormalization())\n\n model.add(Conv2D(16, kernel_size=2, activation='relu'))\n model.add(Conv2D(16, kernel_size=2, activation='relu'))\n model.add(MaxPool2D(2, 2))\n model.add(BatchNormalization())\n\n model.add(Conv2D(64, kernel_size=3, activation='relu'))\n model.add(BatchNormalization())\n\n model.add(Flatten())\n model.add(Dense(32, activation=\"relu\"))\n model.add(Dense(16, activation=\"relu\"))\n model.add(Dense(6, activation=\"softmax\"))\n\n optimizer = Nadam(lr=0.004)\n model.compile(optimizer=optimizer,\n loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n\n model.fit(X_training, y_training1hot, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation1hot),\n shuffle=True)\n\n y_validation_predictions = model.predict(\n X_validation, verbose=1) # make predictions\n\n # create the confusion matrix\n cnf_matrix3 = confusion_matrix(\n y_validation-1, np.argmax(y_validation_predictions, axis=1))\n # add together all the confusion matrices\n cm_cv3 += cnf_matrix3\n\n _, accuracy = model.evaluate(\n X_validation, y_validation1hot) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv3, 1)\nfor i in range(0, 6):\n cm_cv3[i, :] = cm_cv3[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv3, 4), 6)\n\n\ncm_cv3\n\n\n# ### PCA with DFNN, 2 classes\n#\n# \u0391\u03c1\u03c7\u03b9\u03ba\u03ac \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 PCA \u03bc\u03b5 svd \u03c0\u03c1\u03b9\u03bd, \u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ce\u03bd\u03c4\u03b1\u03c2 k=180. 
\u03a3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1, \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1 \u03bc\u03bf\u03bd\u03c4\u03ad\u03bb\u03bf DFNN \u03c3\u03c4\u03b9\u03c2 \u03b4\u03cd\u03bf \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, human face \u03ba\u03b1\u03b9 inanimate object.\n\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X = np.array(mat[\"X_2D\"])\n y = np.array(mat[\"exemplarLabels\"]).ravel() # get labels\n\n y_bin = []\n X_bin = []\n\n for i in range(0, len(X)): # keep only the hf and the io category\n if (12 < y[i] < 25):\n y_bin.append(0) # set class hf as 0\n X_bin.append(X[i])\n elif (60 < y[i]): # set class io as 1\n y_bin.append(1)\n X_bin.append(X[i])\n\n X_bin = np.array(X_bin)\n y_bin = np.array(y_bin).ravel()\n\n # pca with svd using an optimum k\n [u, s, v] = la.svd(X_bin)\n v = v.transpose()\n v_new = v[:, :180]\n X_bin = np.dot(X_bin, v_new)\n\n X_training = X_bin[:int(0.8*len(X_bin))] # create train and test sets\n X_validation = X_bin[int(0.8*len(X_bin)):]\n\n y_training = y_bin[:int(0.8*len(X_bin))]\n y_validation = y_bin[int(0.8*len(X_bin)):]\n\n num_classes = 2 # we have only 2 classes, hf and io\n\n # dfnn model\n model = Sequential()\n model.add(Dense(160, input_dim=180, activation='relu'))\n model.add(Dense(140, activation='tanh',\n activity_regularizer=regularizers.l1(1e-3)))\n model.add(Dense(120, activation='tanh',\n kernel_regularizer=regularizers.l2(1e-5)))\n model.add(Dense(64, input_dim=8, activation='tanh'))\n model.add(Dense(32, activation='relu',\n activity_regularizer=regularizers.l2(1e-7)))\n model.add(Dense(1, activation='sigmoid'))\n\n model.compile(loss='binary_crossentropy',\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation),\n shuffle=True)\n\n _, accuracy = model.evaluate(X_validation, y_validation)\n print('Accuracy: %.2f' % (accuracy*100))\n\n y_validation_predictions = np.round(model.predict(\n X_validation, verbose=1)) # make predictions\n\n # create the confusion matrix\n cnf_matrix4 = confusion_matrix(y_validation, y_validation_predictions)\n # add together all the confusion matrices\n cm_cv4 += cnf_matrix4\n\n _, accuracy = model.evaluate(\n X_validation, y_validation) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n\n# normalize the final confusion matrix\nsum_by_row = np.sum(cm_cv4, 1)\nfor i in range(0, 2):\n cm_cv4[i, :] = cm_cv4[i, :] / sum_by_row[i]\n\n# plot the final confusion matrix\nplot_cm(np.round(cm_cv4, 4), 2)\n\n\ncm_cv4\n\n\n# ### Simple CNN, 2 classes\n#\n# \u0395\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03b1\u03c0\u03bb\u03cc cnn \u03c3\u03c4\u03b9\u03c2 \u03b4\u03cd\u03bf \u03ba\u03bb\u03ac\u03c3\u03b5\u03b9\u03c2, human face \u03ba\u03b1\u03b9 inanimate object.\n\n\nkeep_accuracy = np.zeros((10))\nj = 0\n\nfor participant in files:\n mat = loadmat(participant)\n X = np.array(mat[\"X_2D\"])\n y = np.array(mat[\"exemplarLabels\"]).ravel() # get labels\n\n y_bin = []\n X_bin = []\n\n for i in range(0, len(X)): # keep only the hf and the io category\n if (12 < y[i] < 25):\n y_bin.append(0) # set class hf as 0\n X_bin.append(X[i])\n elif (60 < y[i]): # set class io as 0\n y_bin.append(1)\n X_bin.append(X[i])\n\n X_bin = np.array(X_bin)\n y_bin = 
np.array(y_bin).ravel()\n\n X_training = X_bin[:int(0.8*len(X_bin))] # create train and test sets\n X_validation = X_bin[int(0.8*len(X_bin)):]\n\n y_training = y_bin[:int(0.8*len(X_bin))]\n y_validation = y_bin[int(0.8*len(X_bin)):]\n\n # reshape to treat the data like images (124x32)\n X_training = np.reshape(X_training, (-1, 124, 32, 1))\n X_validation = np.reshape(X_validation, (-1, 124, 32, 1))\n\n num_classes = 2 # we have only 2 classes, hf and io\n\n # cnn model\n model = Sequential()\n\n model.add(\n Conv2D(32, (3, 3), input_shape=X_training.shape[1:], activation=\"relu\"))\n model.add(Flatten())\n model.add(Dense(units=128, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),\n bias_regularizer=regularizers.l2(1e-3),\n activity_regularizer=regularizers.l2(1e-3),\n activation=\"relu\"))\n model.add(Dropout(0.15))\n model.add(Dense(units=num_classes-1,\n activity_regularizer=regularizers.l2(1e-5), activation=\"sigmoid\"))\n\n model.compile(loss=keras.losses.BinaryCrossentropy(),\n optimizer='nadam',\n metrics=['accuracy'])\n\n model.fit(X_training, y_training, # train the model\n epochs=50,\n validation_data=(X_validation, y_validation),\n shuffle=True)\n\n y_validation_predictions = np.round(model.predict(\n X_validation, verbose=1)) # make predictions\n\n # create the confusion matrix\n cnf_matrix5 = confusion_matrix(y_validation, y_validation_predictions)\n # add together all the confusion matrices\n cm_cv5 += cnf_matrix5\n\n _, accuracy = model.evaluate(\n X_validation, y_validation) # find accuracy\n keep_accuracy[j] = accuracy\n j = j + 1\n print(\" \")\n\nprint('Accuracy: %.2f' % (np.mean(keep_accuracy)*100)\n ) # print mean accuracy\n", "project_metadata": {"full_name": "dzerkes/A-Representational-Similarity-Analysis-of-the-Dynamics-of-Object-Processing-Using-Single--Trial-EEG-", "description": "Paper Reproducion of \"A Representational Similarity Analysis of the Dynamics of Object Processing Using Single- Trial EEG Classification\" . 
Also use of deep learning techniques instead of LDA,SVM-RBF that is proposed in the paper", "topics": [], "git_url": "git://github.com/dzerkes/A-Representational-Similarity-Analysis-of-the-Dynamics-of-Object-Processing-Using-Single--Trial-EEG-.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-07-02T19:24:51Z", "size": 8720, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5633909}, "last_updated": "2020-11-26T04:51:05Z"}, "intent": "# normalize the final confusion matrix"}, {"original_comment": " # normalize\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom itertools import count\nfrom random import random\nimport numpy as np\n\n#%%\n\ndef get_random_points(n):\n return np.random.uniform(-1, 1, [n, 2])\n\n\ndef get_random_line_vector():\n rp = np.random.uniform(-1, 1, (2, 2))\n slope = (rp[0, 1] - rp[1, 1]) / (rp[0, 0] - rp[1, 0])\n intercept = rp[0, 1] - rp[0, 0] * slope\n return [intercept, slope, 1]\n\n\ndef get_labels(pi, vector):\n labels = np.sign(np.dot(pi, vector))\n return labels\n\n\ndef add_intercept(points):\n pi = np.array([[1, x1, x2] for (x1, x2) in points])\n return pi\n\n#%%\n\ndef get_initial_w():\n w = np.array([0, 0, 0.001])\n return w\n\n#%%\n\ndef get_perceptron_predictions(pi, labels, w):\n return np.sign(np.dot(pi, w)) == labels\n\n\ndef update_w(w, mp, label):\n return w + label*mp\n\n#%%\n\ndef train_perceptron(pi, labels, w):\n for num_iterations in count(start=0, step=1):\n predictions = get_perceptron_predictions(pi, labels, w)\n\n if predictions.all():\n break\n\n missclassified_points_indexes = np.where(predictions == False)[0]\n k = np.random.choice(missclassified_points_indexes, 1)[0]\n w = update_w(w, pi[k], labels[k])\n# print(w)\n return w, num_iterations\n\n#%%\n\nN_TEST = 10000\npoints_test = get_random_points(N_TEST) # hack for speed\npi_test = add_intercept(points_test)\n\n\ndef evaluate_solution(w, fx_vector):\n labels_test = get_labels(pi_test, fx_vector)\n return get_perceptron_predictions(pi_test, labels_test, w).sum() / N_TEST\n\n\n# # SVM\n\n#%%\n\ndef fit_svm(x, y):\n NUM = x.shape[0]\n DIM = x.shape[1]\n # we'll solve the dual\n # obtain the kernel\n K = y[:, None] * x\n K = np.dot(K, K.T)\n P = matrix(K)\n q = matrix(-np.ones((NUM, 1)))\n G = matrix(-np.eye(NUM))\n h = matrix(np.zeros(NUM))\n A = matrix(y.reshape(1, -1))\n b = matrix(np.zeros(1))\n solvers.options['show_progress'] = False\n sol = solvers.qp(P, q, G, h, A, b)\n alphas = np.array(sol['x'])\n return alphas\n\n#%%\n\ndef get_w_svm(alphas, points, labels):\n # get weights\n w_svm = np.sum(alphas * labels[:, None] * points, axis=0)\n # get bias\n cond = (alphas > 1e-4).reshape(-1)\n b = labels[cond] - np.dot(points[cond], w_svm)\n bias = b[0]", "target_code": " norm = np.linalg.norm(w_svm)\n w_svm_norm, bias_norm = w_svm / norm, bias / norm\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport matplotlib.pyplot as plt\nfrom itertools import count\nfrom random import random\nimport numpy as np\n\n\ndef get_random_points(n):\n return np.random.uniform(-1, 1, [n, 2])\n\n\ndef get_random_line_vector():\n rp = np.random.uniform(-1, 1, (2, 2))\n slope = (rp[0, 1] - rp[1, 1]) / (rp[0, 0] - rp[1, 0])\n intercept = rp[0, 1] - rp[0, 0] * slope\n return [intercept, slope, 1]\n\n\ndef get_labels(pi, vector):\n labels = np.sign(np.dot(pi, vector))\n return labels\n\n\ndef add_intercept(points):\n pi = np.array([[1, x1, x2] for (x1, x2) in points])\n return pi\n\n\ndef 
get_initial_w():\n w = np.array([0, 0, 0.001])\n return w\n\n\ndef get_perceptron_predictions(pi, labels, w):\n return np.sign(np.dot(pi, w)) == labels\n\n\ndef update_w(w, mp, label):\n return w + label*mp\n\n\ndef train_perceptron(pi, labels, w):\n for num_iterations in count(start=0, step=1):\n predictions = get_perceptron_predictions(pi, labels, w)\n\n if predictions.all():\n break\n\n missclassified_points_indexes = np.where(predictions == False)[0]\n k = np.random.choice(missclassified_points_indexes, 1)[0]\n w = update_w(w, pi[k], labels[k])\n# print(w)\n return w, num_iterations\n\n\nN_TEST = 10000\npoints_test = get_random_points(N_TEST) # hack for speed\npi_test = add_intercept(points_test)\n\n\ndef evaluate_solution(w, fx_vector):\n labels_test = get_labels(pi_test, fx_vector)\n return get_perceptron_predictions(pi_test, labels_test, w).sum() / N_TEST\n\n\n# # SVM\n\n\ndef fit_svm(x, y):\n NUM = x.shape[0]\n DIM = x.shape[1]\n # we'll solve the dual\n # obtain the kernel\n K = y[:, None] * x\n K = np.dot(K, K.T)\n P = matrix(K)\n q = matrix(-np.ones((NUM, 1)))\n G = matrix(-np.eye(NUM))\n h = matrix(np.zeros(NUM))\n A = matrix(y.reshape(1, -1))\n b = matrix(np.zeros(1))\n solvers.options['show_progress'] = False\n sol = solvers.qp(P, q, G, h, A, b)\n alphas = np.array(sol['x'])\n return alphas\n\n\ndef get_w_svm(alphas, points, labels):\n # get weights\n w_svm = np.sum(alphas * labels[:, None] * points, axis=0)\n # get bias\n cond = (alphas > 1e-4).reshape(-1)\n b = labels[cond] - np.dot(points[cond], w_svm)\n bias = b[0]\n", "project_metadata": {"full_name": "stathius/learning_from_data", "description": "Exercise for the book/course Learning from Data", "topics": [], "git_url": "git://github.com/stathius/learning_from_data.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2016-10-13T13:39:18Z", "size": 917, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 575625, "Python": 1540}, "last_updated": "2020-08-14T15:03:14Z"}, "intent": " # normalize"}, {"original_comment": "# Check if the video opened successfully otherwise exit\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Document Details\n# ### Author: Eric Medwedeff\n# ### Date: 4/2/2019\n# ### Program Name: Video2Image\n#\n# ### Details:\n# This program reads in a video from the user specified directory and writes the output to the user specified directory.\n\n# ## First Step is to get our Imports\n\n#%%\n\nimport cv2 # For video processing\nimport numpy as np # For numerical tasks\nimport logging # For tracking purposes to be added later\nimport sys # For our nifty print\n\n\n# ### A nifty little printer to make our output tidy and some helper function(s)\n\n#%%\n\ndef printer(index, length):\n sys.stdout.write(f\"\\rProcessing sample {index+1} of {length}\")\n sys.stdout.flush()\n\n\ndef saveImage(outputPath, frame, count):\n outputNamePath = (\"{}{}.jpeg\").format(outputPath, str(count).zfill(6))\n cv2.imwrite(outputNamePath, frame)\n\n\n# ## Second step is to load our args and the video\n\n#%%\n\nclass dummyStuff:\n def __init__(self):\n self.video_path = \"videos/sample.mp4\"\n self.output_path = \"videoImages/\"\n self.display = False\n #self.resolution = 1080\n\n\nargs = dummyStuff()\n\n# Set the args\nvideoPath = args.video_path\noutputPath = args.output_path\ndisplay = args.display\n# dim = (256, 144) #Only works for 16:9 aspect ration\n\n# Open the video\ncap = cv2.VideoCapture(videoPath)\n\n# Grab the number of frames we expect\nlength = 
int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\nwidth = int(cv2.CAP_PROP_FRAME_WIDTH)\nheight = int(cv2.CAP_PROP_FRAME_HEIGHT)\n# Normally not an int, but here we extract a frame a second!\nfps = int(cap.get(cv2.CAP_PROP_FPS))", "target_code": "if(cap.isOpened() == False):\n raise RuntimeError(\"Error opening the videostream or file!\\n\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Document Details\n# ### Author: Eric Medwedeff\n# ### Date: 4/2/2019\n# ### Program Name: Video2Image\n#\n# ### Details:\n# This program reads in a video from the user specified directory and writes the output to the user specified directory.\n\n# ## First Step is to get our Imports\n\n\nimport cv2 # For video processing\nimport numpy as np # For numerical tasks\nimport logging # For tracking purposes to be added later\nimport sys # For our nifty print\n\n\n# ### A nifty little printer to make our output tidy and some helper function(s)\n\n\ndef printer(index, length):\n sys.stdout.write(f\"\\rProcessing sample {index+1} of {length}\")\n sys.stdout.flush()\n\n\ndef saveImage(outputPath, frame, count):\n outputNamePath = (\"{}{}.jpeg\").format(outputPath, str(count).zfill(6))\n cv2.imwrite(outputNamePath, frame)\n\n\n# ## Second step is to load our args and the video\n\n\nclass dummyStuff:\n def __init__(self):\n self.video_path = \"videos/sample.mp4\"\n self.output_path = \"videoImages/\"\n self.display = False\n #self.resolution = 1080\n\n\nargs = dummyStuff()\n\n# Set the args\nvideoPath = args.video_path\noutputPath = args.output_path\ndisplay = args.display\n# dim = (256, 144) #Only works for 16:9 aspect ration\n\n# Open the video\ncap = cv2.VideoCapture(videoPath)\n\n# Grab the number of frames we expect\nlength = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\nwidth = int(cv2.CAP_PROP_FRAME_WIDTH)\nheight = int(cv2.CAP_PROP_FRAME_HEIGHT)\n# Normally not an int, but here we extract a frame a second!\nfps = int(cap.get(cv2.CAP_PROP_FPS))\n", "project_metadata": {"full_name": "ejm930/UCI-Data-Science-Initiative-ComputerVision-Tutorial", "description": "Tutorial UCI's opencv Data Science Initiative", "topics": [], "git_url": "git://github.com/ejm930/UCI-Data-Science-Initiative-ComputerVision-Tutorial.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2019-05-08T06:04:55Z", "size": 42169, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10304104}, "last_updated": "2019-05-12T01:07:23Z"}, "intent": "# Check if the video opened successfully otherwise exit"}, {"original_comment": "# You can drop all the missing data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Spark DataFrame\n\n#%%\n\nfrom pyspark.sql.functions import countDistinct, avg, stddev\nfrom pyspark.sql.functions import (dayofmonth, hour,\n dayofyear, month,\n year, weekofyear,\n format_number, date_format)\nfrom pyspark.sql.functions import mean\nfrom pyspark.sql.functions import format_number\nfrom pyspark.sql.types import (StructField, StringType,\n IntegerType, StructType)\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/jubinsoni/spark-2.1.0-bin-hadoop2.7')\n\n#%%\n\nspark = SparkSession.builder.appName('Basics').getOrCreate()\n\n#%%\n\ndf = spark.read.json('people.json')\n\n#%%\n\ndf.show()\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.printSchema()\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.describe()\n\n#%%\n\ndf.describe().show()\n\n#%%\n\n\n\n#%%\n\n# age: name of variable, type of variable, True/False: if it can contain null\ndata_schema = [StructField('age', 
IntegerType(), True),\n StructField('name', StringType(), True)]\n\n#%%\n\ndata_schema\n\n#%%\n\nfinal_struc = StructType(fields=data_schema)\n\n#%%\n\nfinal_struc\n\n#%%\n\ndf = spark.read.json('people.json', schema=final_struc)\n\n#%%\n\ndf.printSchema()\n# Now it has corrected the age as integer instead of long\n\n#%%\n\ndf.show()\n\n#%%\n\ndf['age']\n\n#%%\n\ntype(df['age'])\n\n#%%\n\n# If you want to select a dataframe column\ndf.select('age')\n\n#%%\n\n# If you want to view the results of dataframe column\ndf.select('age').show()\n\n#%%\n\n# If you want to view selected rows of dataframe\ndf.head(2)\n\n#%%\n\ndf.head(2)[0]\n\n#%%\n\ndf.head(2)[0]['name']\n\n#%%\n\ntype(df.head(2)[0])\n\n#%%\n\ntype(df.head(2)[0]['name'])\n\n#%%\n\n# The reason there are so many datatypes and objects in Spark because of its ability\n# to read from a distributed source and map that out to compute\n\n#%%\n\n# selecting multiple columns\ndf.select(['age', 'name'])\n\n#%%\n\ndf.select(['age', 'name']).show()\n\n#%%\n\n# Creating new columns\ndf.withColumn('newAge', df['age']).show()\n\n#%%\n\n# Creating new columns\ndf.withColumn('doubleAge', df['age']*2).show()\n\n#%%\n\n# The above operations are not inplace so everytime you make\n# any changes you have to save them to a new variable\n\n#%%\n\n# Renaming a column\ndf.withColumnRenamed('age', 'my_new_age').show()\n\n\n# ## Working with Spark SQL queries\n\n#%%\n\n# In order to work with SQL queries we need to register spark\n# As a temporary SQL view\n# We do CreateOrReplace incase we have already existing view it replaces\n\ndf.createOrReplaceTempView('people')\n\n#%%\n\n# Now you can pass direct SQL queries\nresults = spark.sql(\"SELECT * FROM people\")\n\n#%%\n\nresults.show()\n\n#%%\n\nage_results = spark.sql(\"SELECT * FROM people WHERE age=30\")\n\n#%%\n\nage_results.show()\n\n\n# ## Working with JSON, CSV and other formats\n\n#%%\n\n\n\n#%%\n\nspark = SparkSession.builder.appName('ops').getOrCreate()\n\n#%%\n\n# infer schema option is available with CSV\n\ndf = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)\n\n#%%\n\ndf.printSchema()\n\n#%%\n\ndf.head(3)\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'df.head(1)[0]')\n\n#%%\n\ndf.show()\n\n#%%\n\n# You can pass in conditions like\ndf.filter('Close < 500').show()\n\n#%%\n\n# Opening price of every stock where Close price is less than 500\ndf.filter('Close < 500').select('Open').show()\n\n#%%\n\ndf.filter('Close < 500').select(['Open', 'Close']).show()\n\n#%%\n\ndf.filter(df['Close'] < 500).show()\n\n#%%\n\ndf.filter(df['Close'] < 500).select('Volume').show()\n\n#%%\n\n# Multiple conditions in spark dataframe\ndf.filter((df['Close'] < 200) & (df['Open'] > 200)).show()\n\n#%%\n\n# Not operator\ndf.filter((df['Close'] < 200) & ~(df['Open'] > 200)).show()\n\n#%%\n\ndf.filter(df['Low'] == 197.16).show()\n\n#%%\n\n# The above results look very messy and lot of times we want\n# to collect the data to work with\ndf.filter(df['Low'] == 197.16).collect()\n\n#%%\n\nresult = df.filter(df['Low'] == 197.16).collect()\n\n#%%\n\nresult[0]\n\n#%%\n\nrow = result[0]\n\nrow.asDict()\n\n#%%\n\nrow.asDict()['Volume']\n\n#%%\n\nresult[0]['Volume']\n\n\n# ## GroupBy, Aggregate and Sorting Functions\n\n#%%\n\nspark = SparkSession.builder.appName('aggs').getOrCreate()\n\n#%%\n\ndf = spark.read.csv('sales_info.csv', inferSchema=True, 
header=True)\n\n#%%\n\ndf.count()\n\n#%%\n\ndf.printSchema()\n\n#%%\n\ndf.show()\n\n#%%\n\ndf.groupBy('Company').count().show()\n\n#%%\n\ndf.groupBy('Company').mean().show()\n\n#%%\n\ndf.groupBy('Company').max().show()\n\n#%%\n\ndf.groupBy('Company').min().show()\n\n#%%\n\n# May be you dont need group by, you need something\n# like average sales per company\n\ndf.agg({'Sales': 'sum'}).show()\n\n#%%\n\ndf.agg({'Sales': 'max', 'Sales': 'min'}).show()\n\n#%%\n\ngroup_data = df.groupBy('Company')\n\n#%%\n\ngroup_data\n\n#%%\n\n# The agg method done using groupby\ngroup_data.agg({'Sales': 'max'}).show()\n\n#%%\n\n\n\n#%%\n\n# Counts distinct sales value\ndf.select(countDistinct('Sales')).show()\n\n#%%\n\ndf.select(avg('Sales')).show()\n\n#%%\n\n# Alias - Give the column name appropriate name\ndf.select(avg('Sales').alias('Average Sales')).show()\n\n#%%\n\ndf.select(stddev('Sales')).show()\n\n#%%\n\n# Formatting the number\n\n#%%\n\nsales_std = df.select(stddev('Sales').alias('std'))\n\n#%%\n\nsales_std.show()\n\n#%%\n\n# number of decimal places I want to show\nsales_std.select(format_number('std', 2)).show()\n\n#%%\n\n# Again chaining name here\nsales_std.select(format_number('std', 2).alias('std')).show()\n\n#%%\n\n# Ordering the columns, default is ascending order\n\ndf.orderBy('Sales').show()\n\n#%%\n\n#Ordering in descending\ndf.orderBy(df['Sales'].desc()).show()\n\n\n# ## Dealing with Missing Data in Spark\n\n#%%\n\nspark = SparkSession.builder.appName('miss').getOrCreate()\n\n#%%\n\ndf = spark.read.csv('ContainsNull.csv', header=True, inferSchema=True)\n\n#%%\n\ndf.show()\n\n#%%", "target_code": "df.na.drop().show()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Spark DataFrame\n\n\nfrom pyspark.sql.functions import countDistinct, avg, stddev\nfrom pyspark.sql.functions import (dayofmonth, hour,\n dayofyear, month,\n year, weekofyear,\n format_number, date_format)\nfrom pyspark.sql.functions import mean\nfrom pyspark.sql.functions import format_number\nfrom pyspark.sql.types import (StructField, StringType,\n IntegerType, StructType)\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/jubinsoni/spark-2.1.0-bin-hadoop2.7')\n\n\nspark = SparkSession.builder.appName('Basics').getOrCreate()\n\n\ndf = spark.read.json('people.json')\n\n\ndf.show()\n\n\ndf.head()\n\n\ndf.printSchema()\n\n\ndf.columns\n\n\ndf.describe()\n\n\ndf.describe().show()\n\n\n\n\n\n# age: name of variable, type of variable, True/False: if it can contain null\ndata_schema = [StructField('age', IntegerType(), True),\n StructField('name', StringType(), True)]\n\n\ndata_schema\n\n\nfinal_struc = StructType(fields=data_schema)\n\n\nfinal_struc\n\n\ndf = spark.read.json('people.json', schema=final_struc)\n\n\ndf.printSchema()\n# Now it has corrected the age as integer instead of long\n\n\ndf.show()\n\n\ndf['age']\n\n\ntype(df['age'])\n\n\n# If you want to select a dataframe column\ndf.select('age')\n\n\n# If you want to view the results of dataframe column\ndf.select('age').show()\n\n\n# If you want to view selected rows of dataframe\ndf.head(2)\n\n\ndf.head(2)[0]\n\n\ndf.head(2)[0]['name']\n\n\ntype(df.head(2)[0])\n\n\ntype(df.head(2)[0]['name'])\n\n\n# The reason there are so many datatypes and objects in Spark because of its ability\n# to read from a distributed source and map that out to compute\n\n\n# selecting multiple columns\ndf.select(['age', 'name'])\n\n\ndf.select(['age', 'name']).show()\n\n\n# Creating new columns\ndf.withColumn('newAge', df['age']).show()\n\n\n# Creating 
new columns\ndf.withColumn('doubleAge', df['age']*2).show()\n\n\n# The above operations are not inplace so everytime you make\n# any changes you have to save them to a new variable\n\n\n# Renaming a column\ndf.withColumnRenamed('age', 'my_new_age').show()\n\n\n# ## Working with Spark SQL queries\n\n\n# In order to work with SQL queries we need to register spark\n# As a temporary SQL view\n# We do CreateOrReplace incase we have already existing view it replaces\n\ndf.createOrReplaceTempView('people')\n\n\n# Now you can pass direct SQL queries\nresults = spark.sql(\"SELECT * FROM people\")\n\n\nresults.show()\n\n\nage_results = spark.sql(\"SELECT * FROM people WHERE age=30\")\n\n\nage_results.show()\n\n\n# ## Working with JSON, CSV and other formats\n\n\n\n\n\nspark = SparkSession.builder.appName('ops').getOrCreate()\n\n\n# infer schema option is available with CSV\n\ndf = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)\n\n\ndf.printSchema()\n\n\ndf.head(3)\n\n\nget_ipython().run_cell_magic('time', '', 'df.head(1)[0]')\n\n\ndf.show()\n\n\n# You can pass in conditions like\ndf.filter('Close < 500').show()\n\n\n# Opening price of every stock where Close price is less than 500\ndf.filter('Close < 500').select('Open').show()\n\n\ndf.filter('Close < 500').select(['Open', 'Close']).show()\n\n\ndf.filter(df['Close'] < 500).show()\n\n\ndf.filter(df['Close'] < 500).select('Volume').show()\n\n\n# Multiple conditions in spark dataframe\ndf.filter((df['Close'] < 200) & (df['Open'] > 200)).show()\n\n\n# Not operator\ndf.filter((df['Close'] < 200) & ~(df['Open'] > 200)).show()\n\n\ndf.filter(df['Low'] == 197.16).show()\n\n\n# The above results look very messy and lot of times we want\n# to collect the data to work with\ndf.filter(df['Low'] == 197.16).collect()\n\n\nresult = df.filter(df['Low'] == 197.16).collect()\n\n\nresult[0]\n\n\nrow = result[0]\n\nrow.asDict()\n\n\nrow.asDict()['Volume']\n\n\nresult[0]['Volume']\n\n\n# ## GroupBy, Aggregate and Sorting Functions\n\n\nspark = SparkSession.builder.appName('aggs').getOrCreate()\n\n\ndf = spark.read.csv('sales_info.csv', inferSchema=True, header=True)\n\n\ndf.count()\n\n\ndf.printSchema()\n\n\ndf.show()\n\n\ndf.groupBy('Company').count().show()\n\n\ndf.groupBy('Company').mean().show()\n\n\ndf.groupBy('Company').max().show()\n\n\ndf.groupBy('Company').min().show()\n\n\n# May be you dont need group by, you need something\n# like average sales per company\n\ndf.agg({'Sales': 'sum'}).show()\n\n\ndf.agg({'Sales': 'max', 'Sales': 'min'}).show()\n\n\ngroup_data = df.groupBy('Company')\n\n\ngroup_data\n\n\n# The agg method done using groupby\ngroup_data.agg({'Sales': 'max'}).show()\n\n\n\n\n\n# Counts distinct sales value\ndf.select(countDistinct('Sales')).show()\n\n\ndf.select(avg('Sales')).show()\n\n\n# Alias - Give the column name appropriate name\ndf.select(avg('Sales').alias('Average Sales')).show()\n\n\ndf.select(stddev('Sales')).show()\n\n\n# Formatting the number\n\n\nsales_std = df.select(stddev('Sales').alias('std'))\n\n\nsales_std.show()\n\n\n# number of decimal places I want to show\nsales_std.select(format_number('std', 2)).show()\n\n\n# Again chaining name here\nsales_std.select(format_number('std', 2).alias('std')).show()\n\n\n# Ordering the columns, default is ascending order\n\ndf.orderBy('Sales').show()\n\n\n#Ordering in descending\ndf.orderBy(df['Sales'].desc()).show()\n\n\n# ## Dealing with Missing Data in Spark\n\n\nspark = SparkSession.builder.appName('miss').getOrCreate()\n\n\ndf = spark.read.csv('ContainsNull.csv', 
header=True, inferSchema=True)\n\n\ndf.show()\n\n", "project_metadata": {"full_name": "jubins/Spark-And-MLlib-Projects", "description": "This repository contains Spark, MLlib, PySpark and Dataframes projects", "topics": ["spark", "mllib", "spark-ml", "sparksql", "spark-streaming", "spark-dataframes", "pyspark", "python", "aws-ec2"], "git_url": "git://github.com/jubins/Spark-And-MLlib-Projects.git", "stars": 25, "watchers": 25, "forks": 70, "created": "2017-10-22T04:00:10Z", "size": 103, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 188848}, "last_updated": "2020-12-27T11:31:03Z"}, "intent": "# drop all the missing data"}, {"original_comment": "# ### Check data types\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Wrangling\n# In the very begining we need to make sure the data is decent for modeling.\n\n#%%\n\n# data analysis pkg\nimport os\nimport warnings\nfrom tqdm import tqdm as progressbar\nfrom wordcloud import WordCloud\nfrom bidi.algorithm import get_display\nimport arabic_reshaper\nimport re\nfrom hazm import word_tokenize, stopwords_list, InformalLemmatizer\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set()\n\n# data wrangling, cleaning and wordcloud visualization pkg\n\n\n# Avoiding warnings\n########### Prevent Warnings ###########\nwarnings.filterwarnings(action='ignore')\n########### Prevent Warnings ###########\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Read data\n\n#%%\n\ndf = pd.read_csv('data/original/train.csv')\n\n#%%\n\ndf.head()\n\n\n# #### All data is from unique ids?\n\n#%%\n\ndf.id.nunique()\n\n\n# - Yes they are.\n\n# ## Do we have null data?\n\n#%%\n\ndf.isnull().sum()\n\n\n# ### Deleting rows with both comment and title set as null\n# its an NLP project and texts are most valuable data we have.\n# So we delete rows which have null in both comment and title\n\n#%%\n\ndf = df[~((df.title.isnull()) & (df.comment.isnull()))]\ndf = df.reset_index(drop=True)", "target_code": "df.dtypes\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Wrangling\n# In the very begining we need to make sure the data is decent for modeling.\n\n\n# data analysis pkg\nimport os\nimport warnings\nfrom tqdm import tqdm as progressbar\nfrom wordcloud import WordCloud\nfrom bidi.algorithm import get_display\nimport arabic_reshaper\nimport re\nfrom hazm import word_tokenize, stopwords_list, InformalLemmatizer\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set()\n\n# data wrangling, cleaning and wordcloud visualization pkg\n\n\n# Avoiding warnings\n########### Prevent Warnings ###########\nwarnings.filterwarnings(action='ignore')\n########### Prevent Warnings ###########\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Read data\n\n\ndf = pd.read_csv('data/original/train.csv')\n\n\ndf.head()\n\n\n# #### All data is from unique ids?\n\n\ndf.id.nunique()\n\n\n# - Yes they are.\n\n# ## Do we have null data?\n\n\ndf.isnull().sum()\n\n\n# ### Deleting rows with both comment and title set as null\n# its an NLP project and texts are most valuable data we have.\n# So we delete rows which have null in both comment and title\n\n\ndf = df[~((df.title.isnull()) & (df.comment.isnull()))]\ndf = df.reset_index(drop=True)\n\n\n\n", "project_metadata": {"full_name": "masouduut94/Digikala_comments_verification", "description": "Digikala online market has recently published some open 
source data in various categories. Since I always wanted to do some NLP project, so I thought of some useful tutorials in python for newcomers. I really hope this could be useful for you guys. I still keep updating the package and also will share the link of video and article related to this post soon!", "topics": ["digikala", "gensim", "tutorials", "digikalanext-open-datasets", "natural-language-processing", "machine-learning", "keras", "comment-verification", "text-processing", "text-classification", "text-mining"], "git_url": "git://github.com/masouduut94/Digikala_comments_verification.git", "stars": 6, "watchers": 6, "forks": 2, "created": "2019-11-29T15:44:25Z", "size": 30687, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4673551}, "last_updated": "2021-01-06T14:11:00Z"}, "intent": "# Check data types"}, {"original_comment": " # Initialize all the variables.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport numpy as np\nimport tensorflow as tf\nimport tensorflow.python.platform\nimport matplotlib.pyplot as plt\nimport matplotlib\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Global variables.\nNUM_LABELS = 2 # The number of labels.\nBATCH_SIZE = 100 # The number of training examples to use per training step.\n\n#%%\n\n# Helper function in place of original xrange() in python2\ndef xrange(x):\n return iter(range(x))\n\n# Read the data from the given data file and extract the features and labels\n\n\ndef csv_data_reader(filename):\n # Arrays to hold the labels and features.\n label = []\n features = []\n with open(filename, 'r') as datafile:\n for line in datafile:\n row = line.split(\",\")\n label.append(int(row[2]))\n features.append([float(x) for x in row[0:2]])\n\n features_matrix = np.matrix(features).astype(np.float32)\n labels_vect = np.array(label).astype(dtype=np.uint8)\n labels_onehot = (np.arange(NUM_LABELS) ==\n labels_vect[:, None]).astype(np.float32)\n # return the features(1,2) and label\n return features_matrix, labels_onehot\n\n# Init weights method. 
(Reference : Delip Rao: http://deliprao.com/archives/100)\n\n\ndef weight_initializer(shape, init_method='xavier', xavier_params=(None, None)):\n if init_method == 'zeros':\n return tf.Variable(tf.zeros(shape, dtype=tf.float32))\n elif init_method == 'uniform':\n return tf.Variable(tf.random_normal(shape, stddev=0.01, dtype=tf.float32))\n else: # xavier\n (fan_in, fan_out) = xavier_params\n low = -4*np.sqrt(6.0/(fan_in + fan_out)) # {sigmoid:4, tanh:1}\n high = 4*np.sqrt(6.0/(fan_in + fan_out))\n return tf.Variable(tf.random_uniform(shape, minval=low, maxval=high, dtype=tf.float32))\n\n\ndef predictor(x, w_hidden, b_hidden, w_out, b_out):\n hidden = tf.nn.tanh(tf.matmul(tf.cast(x, tf.float32), w_hidden) + b_hidden)\n y = tf.nn.softmax(tf.matmul(tf.cast(hidden, tf.float32), w_out) + b_out)\n pred_result = tf.argmax(y, 1)\n return pred_result.eval()\n\n#%%\n\ntraining_data_fname = \"data/intro_to_ann3.csv\"\ntest_data_fname = \"data/intro_to_ann3.csv\"\n\n# Extract the csv data into numpy arrays.\ntrain_data, train_labels = csv_data_reader(training_data_fname)\ntest_data, test_labels = csv_data_reader(test_data_fname)\ntrain_size, num_features = train_data.shape\nnum_epochs = 3000\nnum_hidden = 10\n\n# The below place holders hold the features and label data\n# that will be used by the program later\nx = tf.placeholder(\"float\", shape=[None, num_features])\ny_ = tf.placeholder(\"float\", shape=[None, NUM_LABELS])\n\ntest_data_node = tf.constant(test_data)\n\n#%%\n\n# Construct Phase\n# Hidden weights and bias initialization\nw_hidden = weight_initializer(\n [num_features, num_hidden],\n 'uniform',\n xavier_params=(num_features, num_hidden))\n\nb_hidden = weight_initializer([1, num_hidden], 'zeros')\n# Construct the hidden layers\nhidden = tf.nn.tanh(tf.matmul(x, w_hidden) + b_hidden)\n# output weights and bias initialization\nw_out = weight_initializer(\n [num_hidden, NUM_LABELS],\n 'uniform',\n xavier_params=(num_hidden, NUM_LABELS))\nb_out = weight_initializer([1, NUM_LABELS], 'zeros')\n# Construct the output layer\ny = tf.nn.softmax(tf.matmul(hidden, w_out) + b_out)\nmodel = tf.initialize_all_variables()\n\n#%%\n\n# Optimization.\ncross_entropy = -tf.reduce_sum(y_*tf.log(y))\ntrain_step = tf.train.AdamOptimizer(0.1).minimize(cross_entropy)\n# Verification Phase\ncorrect_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))\naccuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n\n#%%\n\n# Execution Phase.\nwith tf.Session() as sess:", "target_code": " tf.initialize_all_variables().run()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport numpy as np\nimport tensorflow as tf\nimport tensorflow.python.platform\nimport matplotlib.pyplot as plt\nimport matplotlib\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Global variables.\nNUM_LABELS = 2 # The number of labels.\nBATCH_SIZE = 100 # The number of training examples to use per training step.\n\n\n# Helper function in place of original xrange() in python2\ndef xrange(x):\n return iter(range(x))\n\n# Read the data from the given data file and extract the features and labels\n\n\ndef csv_data_reader(filename):\n # Arrays to hold the labels and features.\n label = []\n features = []\n with open(filename, 'r') as datafile:\n for line in datafile:\n row = line.split(\",\")\n label.append(int(row[2]))\n features.append([float(x) for x in row[0:2]])\n\n features_matrix = np.matrix(features).astype(np.float32)\n labels_vect = np.array(label).astype(dtype=np.uint8)\n labels_onehot = 
(np.arange(NUM_LABELS) ==\n labels_vect[:, None]).astype(np.float32)\n # return the features(1,2) and label\n return features_matrix, labels_onehot\n\n# Init weights method. (Reference : Delip Rao: http://deliprao.com/archives/100)\n\n\ndef weight_initializer(shape, init_method='xavier', xavier_params=(None, None)):\n if init_method == 'zeros':\n return tf.Variable(tf.zeros(shape, dtype=tf.float32))\n elif init_method == 'uniform':\n return tf.Variable(tf.random_normal(shape, stddev=0.01, dtype=tf.float32))\n else: # xavier\n (fan_in, fan_out) = xavier_params\n low = -4*np.sqrt(6.0/(fan_in + fan_out)) # {sigmoid:4, tanh:1}\n high = 4*np.sqrt(6.0/(fan_in + fan_out))\n return tf.Variable(tf.random_uniform(shape, minval=low, maxval=high, dtype=tf.float32))\n\n\ndef predictor(x, w_hidden, b_hidden, w_out, b_out):\n hidden = tf.nn.tanh(tf.matmul(tf.cast(x, tf.float32), w_hidden) + b_hidden)\n y = tf.nn.softmax(tf.matmul(tf.cast(hidden, tf.float32), w_out) + b_out)\n pred_result = tf.argmax(y, 1)\n return pred_result.eval()\n\n\ntraining_data_fname = \"data/intro_to_ann3.csv\"\ntest_data_fname = \"data/intro_to_ann3.csv\"\n\n# Extract the csv data into numpy arrays.\ntrain_data, train_labels = csv_data_reader(training_data_fname)\ntest_data, test_labels = csv_data_reader(test_data_fname)\ntrain_size, num_features = train_data.shape\nnum_epochs = 3000\nnum_hidden = 10\n\n# The below place holders hold the features and label data\n# that will be used by the program later\nx = tf.placeholder(\"float\", shape=[None, num_features])\ny_ = tf.placeholder(\"float\", shape=[None, NUM_LABELS])\n\ntest_data_node = tf.constant(test_data)\n\n\n# Construct Phase\n# Hidden weights and bias initialization\nw_hidden = weight_initializer(\n [num_features, num_hidden],\n 'uniform',\n xavier_params=(num_features, num_hidden))\n\nb_hidden = weight_initializer([1, num_hidden], 'zeros')\n# Construct the hidden layers\nhidden = tf.nn.tanh(tf.matmul(x, w_hidden) + b_hidden)\n# output weights and bias initialization\nw_out = weight_initializer(\n [num_hidden, NUM_LABELS],\n 'uniform',\n xavier_params=(num_hidden, NUM_LABELS))\nb_out = weight_initializer([1, NUM_LABELS], 'zeros')\n# Construct the output layer\ny = tf.nn.softmax(tf.matmul(hidden, w_out) + b_out)\nmodel = tf.initialize_all_variables()\n\n\n# Optimization.\ncross_entropy = -tf.reduce_sum(y_*tf.log(y))\ntrain_step = tf.train.AdamOptimizer(0.1).minimize(cross_entropy)\n# Verification Phase\ncorrect_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))\naccuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n\n\n# Execution Phase.\nwith tf.Session() as sess:\n", "project_metadata": {"full_name": "ml6973/Course", "description": null, "topics": [], "git_url": "git://github.com/ml6973/Course.git", "stars": 20, "watchers": 20, "forks": 25, "created": "2016-08-21T04:15:31Z", "size": 105989, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 13560097, "Python": 453544}, "last_updated": "2020-05-12T13:40:27Z"}, "intent": " # Initialize all the variables."}, {"original_comment": " # store results in one df:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# In this notebook different autoencoder architectures are implemented: Each architecture is based on an encoding-module and a prediction module. The window size refers to how many timesteps are used to make a prediction. 
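#%%

# A minimal sketch (not part of the recorded notebook) of how the execution
# phase of the TensorFlow example above typically continues: initialize all
# the variables first, then feed BATCH_SIZE slices through the placeholders.
# The stepping scheme and the final accuracy print are assumptions;
# tf.initialize_all_variables() is the legacy TF1 initializer this notebook
# relies on (later TF1 releases rename it to tf.global_variables_initializer()).

with tf.Session() as sess:
    # Initialize all the variables before any training step runs.
    sess.run(tf.initialize_all_variables())

    for step in xrange(num_epochs):
        # Cycle through the training set in BATCH_SIZE-sized chunks.
        offset = (step * BATCH_SIZE) % train_size
        batch_data = train_data[offset:offset + BATCH_SIZE, :]
        batch_labels = train_labels[offset:offset + BATCH_SIZE]
        train_step.run(feed_dict={x: batch_data, y_: batch_labels})

    print('Test accuracy:',
          accuracy.eval(feed_dict={x: test_data, y_: test_labels}))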
\"Multivariate\" models process input data of multiple areas at the same time to predict demand.\n\n# # Import Modules\n\n#%%\n\nimport tensorflow as tf\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\nimport matplotlib.dates as mdates\n\nimport datetime\n\nimport sklearn.preprocessing\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import StandardScaler\n\nimport itertools\n\nimport os\n\n#from tqdm import tqdm\n\nfrom keras.models import Model\nfrom keras.models import Sequential\nfrom keras.layers import LSTM\nfrom keras.layers import Dense\nfrom keras.layers import Input\nfrom keras.layers import TimeDistributed\nfrom keras.layers import RepeatVector\nfrom keras.layers import Concatenate\nfrom keras.layers import Dropout\n\nfrom keras import optimizers\nfrom keras.models import model_from_json\nfrom keras import regularizers\n\nfrom keras.callbacks import EarlyStopping\n\nimport save_files_collection as sf\n\n#%%\n\nfrom keras import backend as K\nK.tensorflow_backend._get_available_gpus()\n\n#%%\n\nsess = tf.Session(config=tf.ConfigProto(log_device_placement=True))\n\n#%%\n\n# load data\nTRAIN_PATH = '/media/...'\nStore_PATH = '/media/...'\nfile_name = 'ts_10largest.csv'\nts_10largest = pd.read_csv(TRAIN_PATH + file_name,\n header=0, parse_dates=['date'], index_col='date')\n\n\n# # Data Preparation\n\n#%%\n\n''' Time Series Preparation '''\n\n# Preprocessing Steps: 1) Differencing 2) Scaling 3) Supervised_Sequence\n\n# make timeseries stationary\n\n\ndef differencing(dataset, shift_interval):\n shifted = dataset.shift(shift_interval).copy(deep=True)\n diff_series = dataset - shifted\n diff_series.dropna(inplace=True)\n return diff_series\n\n\ndef invert_differencing(history, predictions, year_to_access):\n history_copy = history.copy()\n history_shift = history_copy.shift(1)\n history_shift.dropna(inplace=True)\n preds_all = predictions + history_shift.loc[year_to_access]\n\n return preds_all\n\n#%%\n\n# it is recommended to scale input data in the range of the activation function used by the network. 
Since we use \"tanh\"..\n# scale datasets based on fit of train_set; input set has to be a reshaped numpy_array!\ndef data_scaler(train_data, valid_data, test_data, scale_range, standardizing_flag=False):\n\n if standardizing_flag == False:\n print('MinMax-Scaling used...')\n # use MinMax-Scaling based on scale_range given...\n # feature_range=(-1, 1)\n scaler = MinMaxScaler(feature_range=scale_range)\n scaler = scaler.fit(train_data)\n # scale train_set:\n train_scaled = scaler.transform(train_data)\n train_scaled_df = pd.DataFrame(train_scaled)\n # scale valididation data based on scaler fitted on training data:\n valid_scaled = scaler.transform(valid_data)\n valid_scaled_df = pd.DataFrame(valid_scaled)\n # scale test_set based on scaler fitted on training data:\n test_scaled = scaler.transform(test_data)\n test_scaled_df = pd.DataFrame(test_scaled)\n\n else:\n print('Standardizing used...')\n\n scaler = StandardScaler()\n scaler = scaler.fit(train_data)\n # scale train_set:\n train_scaled = scaler.transform(train_data)\n train_scaled_df = pd.DataFrame(train_scaled)\n # scale valididation data based on scaler fitted on training data:\n valid_scaled = scaler.transform(valid_data)\n valid_scaled_df = pd.DataFrame(valid_scaled)\n # scale test_set based on scaler fitted on training data:\n test_scaled = scaler.transform(test_data)\n test_scaled_df = pd.DataFrame(test_scaled)\n\n # return data as df, since \"supervised\" function requires df or series as input\n return scaler, train_scaled_df, valid_scaled_df, test_scaled_df\n\n\n# function to invert scaling:\ndef invert_data_scaler(scaler, predictions_data, standardizing_flag, scale_range, n_preds):\n\n if standardizing_flag == False:\n # create new scaler: -> necessary since old scaler might be fitted on lagged values -> expects more dimensions as the predictions have!\n new_scaler = MinMaxScaler(feature_range=scale_range)\n # copy attributes of old scaler to new one: but only for the first columns for which we have real predictions\n new_scaler.min_, new_scaler.scale_ = scaler.min_[\n :n_preds], scaler.scale_[:n_preds]\n\n preds = new_scaler.inverse_transform(predictions_data)\n\n else:\n new_scaler = StandardScaler()\n # copy attributes of old scaler to new one: but only for the first columns for which we have real predictions\n new_scaler.mean_, new_scaler.scale_ = scaler.mean_[\n :n_preds], scaler.scale_[:n_preds]\n\n preds = new_scaler.inverse_transform(predictions_data)\n\n return preds\n\n#%%\n\ndef create_supervised_data_single_ts(single_ts, n_timesteps):\n # copy dfs to ensure no changes are made on original dataset:\n sequence_copy = single_ts.copy(deep=True)\n # create dfs to easily append new columns and access columns:\n sequence_copy = pd.DataFrame(sequence_copy)\n sequence_df = pd.DataFrame(sequence_copy)\n\n # Note: range starts reversed to make sure the lags are in correct order\n for i in range(n_timesteps, 0, -1):\n sequence_df['lag_{}'.format(i)] = sequence_copy.iloc[:, 0].shift(i)\n\n # drop rows with NaNs -> if no lagged features are available, we drop the whole row\n sequence_df.dropna(inplace=True)\n\n return sequence_df\n\n#%%\n\ndef create_supervised_UBER_data_single_ts(single_ts, n_timesteps_T, n_timesteps_F):\n # copy dfs to ensure no changes are made on original dataset:\n sequence_copy = single_ts.copy(deep=True)\n # create dfs to easily append new columns and access columns:\n sequence_copy = pd.DataFrame(sequence_copy)\n sequence_df = pd.DataFrame(sequence_copy)\n\n # Note: range starts reversed to make sure 
the lags are in correct order\n for i in range(n_timesteps_T, 0, -1):\n sequence_df['lag_T{}'.format(i)] = sequence_copy.iloc[:, 0].shift(i)\n for i in range(n_timesteps_F, 0, -1):\n sequence_df['prev_lag_F{}'.format(\n i)] = sequence_copy.iloc[:, 0].shift(i)\n # Note: for future values range starts \"normal\":\n for i in range(1, n_timesteps_F+1):\n sequence_df['fut_{}'.format(i)] = sequence_copy.iloc[:, 0].shift(-i)\n\n # drop rows with NaNs -> if no lagged features are available, we drop the whole row\n sequence_df.dropna(inplace=True)\n\n return sequence_df\n\n\n# # preprocess data for models:\n\n#%%\n\ndef generate_UBER_data_autoencoder(ts_series, multivariate_flag, last_train_set_year,\n validation_set_year, test_set_year, n_timesteps_T, n_timesteps_F, n_preds,\n scale_range, standardizing_flag):\n # prepare data:\n\n # 1) apply differencing\n ts_diff = differencing(ts_series, 1)\n\n # reassign df:\n ts_series = ts_diff\n\n # 2) get supervised data:\n if multivariate_flag == False:\n # change type of ts_series:\n # -> this way we can access the \"column\" parameter\n ts_series = pd.DataFrame(ts_series)\n\n # prepare dict to store results of each area:\n area_labels = list(ts_series.columns)\n area_supervised_dict = {}\n scaler_list = []\n\n # get data for each area:\n for i in range(len(area_labels)):\n print('data for area{} is prepared...'.format(area_labels[i]))\n # create key:\n area_supervised_dict['area{}'.format(area_labels[i])] = []\n # get supervised data:\n supervised_data_df = create_supervised_UBER_data_single_ts(\n ts_series.iloc[:, i], n_timesteps_T, n_timesteps_F)\n\n # get train/test split:\n ts_train_all, ts_test = supervised_data_df.loc[:\n validation_set_year], supervised_data_df.loc[test_set_year]\n # get train/validation Split:\n ts_train, ts_valid = ts_train_all.loc[:\n last_train_set_year], ts_train_all.loc[validation_set_year]\n\n # scale data:\n # create numpy arrays for scaler:\n ts_train_array = ts_train.values\n ts_valid_array = ts_valid.values\n ts_test_array = ts_test.values\n\n print('Data is scaled...')\n scaler, train_scaled, valid_scaled, test_scaled = data_scaler(\n ts_train_array, ts_valid_array, ts_test_array, scale_range, standardizing_flag)\n\n # store scaler:\n scaler_list.append(scaler)\n\n # restore index:\n train_scaled.index = ts_train.index\n valid_scaled.index = ts_valid.index\n test_scaled.index = ts_test.index\n\n # slice lags_T, lags_F, future_F values:\n # slice lags_T which are actually X:\n X_train = train_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_valid = valid_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_test = test_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n # get y values:\n y_train = train_scaled.iloc[:, 0].values\n y_valid = valid_scaled.iloc[:, 0].values\n y_test = test_scaled.iloc[:, 0].values\n # slice lags_F:\n lags_F_train = train_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n lags_F_valid = valid_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n lags_F_test = test_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n # slice future_F:\n fut_F_train = train_scaled.iloc[:, -(n_timesteps_F):].values\n fut_F_valid = valid_scaled.iloc[:, -(n_timesteps_F):].values\n fut_F_test = test_scaled.iloc[:, -(n_timesteps_F):].values\n\n # reshape X, y and lags_F and fut_F data to easily append other areas later on:\n list_to_reshape_X_lags_F_fut_F = [X_train, X_valid, X_test, lags_F_train,\n lags_F_valid, lags_F_test, fut_F_train, fut_F_valid, 
fut_F_test]\n list_to_reshape_y = [y_train, y_valid, y_test]\n reshaped_list_X_lags_F_fut_F = []\n reshaped_list_y = []\n # reshape X:\n for array in list_to_reshape_X_lags_F_fut_F:\n # shape: (#n_samples,n_lags,n_areas)\n array = array.reshape((array.shape[0], array.shape[1], 1))\n reshaped_list_X_lags_F_fut_F.append(array)\n\n # reshape y:\n for array in list_to_reshape_y:\n # shape: (#n_samples,n_areas)\n array = array.reshape((array.shape[0], 1))\n reshaped_list_y.append(array)\n\n # append results to dict:\n items_to_append = [reshaped_list_X_lags_F_fut_F[0], reshaped_list_y[0], reshaped_list_X_lags_F_fut_F[3], reshaped_list_X_lags_F_fut_F[6],\n reshaped_list_X_lags_F_fut_F[1], reshaped_list_y[\n 1], reshaped_list_X_lags_F_fut_F[4], reshaped_list_X_lags_F_fut_F[7],\n reshaped_list_X_lags_F_fut_F[2], reshaped_list_y[2], reshaped_list_X_lags_F_fut_F[5], reshaped_list_X_lags_F_fut_F[8]]\n\n for u in range(len(items_to_append)):\n area_supervised_dict['area{}'.format(\n area_labels[i])].append(items_to_append[u])\n\n # print shapes:\n print('shape of X_train single area: ',\n reshaped_list_X_lags_F_fut_F[0].shape)\n print('shape of X_valid single area: ',\n reshaped_list_X_lags_F_fut_F[1].shape)\n print('shape of X_test single area: ',\n reshaped_list_X_lags_F_fut_F[2].shape)\n print('shape of y_train single area: ', reshaped_list_y[0].shape)\n print('shape of y_valid single area: ', reshaped_list_y[1].shape)\n print('shape of y_test single area: ', reshaped_list_y[2].shape)\n print('shape of prev_lags_F_train single area: ',\n reshaped_list_X_lags_F_fut_F[3].shape)\n print('shape of fut_F_values_train single area: ',\n reshaped_list_X_lags_F_fut_F[6].shape)\n\n # concat results of all areas:\n key_list = list(area_supervised_dict.keys())\n\n if multivariate_flag == True:\n # create training set, valid & test set containing inputs of all selected areas: -> append all areas into one big np.array!\n\n # fill arrays with entries of first area:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][8], area_supervised_dict[key_list[0]][9]\n\n lags_F_train = area_supervised_dict[key_list[0]][2]\n lags_F_valid = area_supervised_dict[key_list[0]][6]\n lags_F_test = area_supervised_dict[key_list[0]][10]\n\n fut_F_train = area_supervised_dict[key_list[0]][3]\n fut_F_valid = area_supervised_dict[key_list[0]][7]\n fut_F_test = area_supervised_dict[key_list[0]][11]\n\n # concat remaining areas:\n for i in range(1, len(key_list)):\n X_train = np.concatenate(\n (X_train, area_supervised_dict[key_list[i]][0]), axis=2)\n X_valid = np.concatenate(\n (X_valid, area_supervised_dict[key_list[i]][4]), axis=2)\n X_test = np.concatenate(\n (X_test, area_supervised_dict[key_list[i]][8]), axis=2)\n y_train = np.concatenate(\n (y_train, area_supervised_dict[key_list[i]][1]), axis=1)\n y_valid = np.concatenate(\n (y_valid, area_supervised_dict[key_list[i]][5]), axis=1)\n y_test = np.concatenate(\n (y_test, area_supervised_dict[key_list[i]][9]), axis=1)\n\n lags_F_train = np.concatenate(\n (lags_F_train, area_supervised_dict[key_list[i]][2]), axis=2)\n lags_F_valid = np.concatenate(\n (lags_F_valid, area_supervised_dict[key_list[i]][6]), axis=2)\n lags_F_test = np.concatenate(\n (lags_F_test, area_supervised_dict[key_list[i]][10]), axis=2)\n\n fut_F_train = np.concatenate(\n (fut_F_train, 
area_supervised_dict[key_list[i]][3]), axis=2)\n fut_F_valid = np.concatenate(\n (fut_F_valid, area_supervised_dict[key_list[i]][7]), axis=2)\n fut_F_test = np.concatenate(\n (fut_F_test, area_supervised_dict[key_list[i]][11]), axis=2)\n\n else:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][8], area_supervised_dict[key_list[0]][9]\n\n lags_F_train = area_supervised_dict[key_list[0]][2]\n lags_F_valid = area_supervised_dict[key_list[0]][6]\n lags_F_test = area_supervised_dict[key_list[0]][10]\n\n fut_F_train = area_supervised_dict[key_list[0]][3]\n fut_F_valid = area_supervised_dict[key_list[0]][7]\n fut_F_test = area_supervised_dict[key_list[0]][11]\n\n print('final shape of X_train: ', X_train.shape)\n\n # return area_supervised_dict\n\n # Note: We only need to return lags_F & fut_F_values for train and valid set since autoencoder is only \"trained\" on these sets..\n return (X_train, y_train, X_valid, y_valid, X_test, y_test, lags_F_train, lags_F_valid, fut_F_train,\n fut_F_valid, scaler_list)\n\n#%%\n\ndef generate_data_autoencoder(ts_series, multivariate_flag, last_train_set_year, validation_set_year, test_set_year,\n n_timesteps, n_preds, scale_range, standardizing_flag):\n # prepare data:\n\n # 1) apply differencing\n ts_diff = differencing(ts_series, 1)\n\n # reassign df:\n ts_series = ts_diff\n\n # 2) get supervised data:\n if multivariate_flag == False:\n # change type of ts_series:\n # -> this way we can access the \"column\" parameter\n ts_series = pd.DataFrame(ts_series)\n\n # prepare dict to store results of each area:\n area_labels = list(ts_series.columns)\n area_supervised_dict = {}\n scaler_list = []\n\n # get data for each area:\n for i in range(len(area_labels)):\n print('data for area{} is prepared...'.format(area_labels[i]))\n # create key:\n area_supervised_dict['area{}'.format(area_labels[i])] = []\n # get supervised data:\n supervised_data_df = create_supervised_data_single_ts(\n ts_series.iloc[:, i], n_timesteps)\n\n # get train/test split:\n ts_train_all, ts_test = supervised_data_df.loc[:\n validation_set_year], supervised_data_df.loc[test_set_year]\n # get train/validation Split:\n ts_train, ts_valid = ts_train_all.loc[:\n last_train_set_year], ts_train_all.loc[validation_set_year]\n\n # scale data:\n # create numpy arrays for scaler:\n ts_train_array = ts_train.values\n ts_valid_array = ts_valid.values\n ts_test_array = ts_test.values\n\n print('Data is scaled...')\n scaler, train_scaled, valid_scaled, test_scaled = data_scaler(\n ts_train_array, ts_valid_array, ts_test_array, scale_range, standardizing_flag)\n\n # store scaler:\n scaler_list.append(scaler)\n\n # restore index:\n train_scaled.index = ts_train.index\n valid_scaled.index = ts_valid.index\n test_scaled.index = ts_test.index\n\n # get X, y pairs:\n X_train = train_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_valid = valid_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_test = test_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n # get y values:\n y_train = train_scaled.iloc[:, 0].values\n y_valid = valid_scaled.iloc[:, 0].values\n y_test = test_scaled.iloc[:, 0].values\n\n # reshape X, y to easily append other areas later on:\n list_to_reshape_X = [X_train, X_valid, X_test]\n list_to_reshape_y = [y_train, y_valid, y_test]\n reshaped_list_X = []\n reshaped_list_y = []\n # reshape X:\n for array in 
list_to_reshape_X:\n # shape: (#n_samples,n_lags,n_areas)\n array = array.reshape((array.shape[0], array.shape[1], 1))\n reshaped_list_X.append(array)\n\n # reshape y:\n for array in list_to_reshape_y:\n # shape: (#n_samples,n_areas)\n array = array.reshape((array.shape[0], 1))\n reshaped_list_y.append(array)\n\n # append results to dict:\n items_to_append = [reshaped_list_X[0], reshaped_list_y[0],\n reshaped_list_X[1], reshaped_list_y[1],\n reshaped_list_X[2], reshaped_list_y[2]]\n\n for u in range(len(items_to_append)):\n area_supervised_dict['area{}'.format(\n area_labels[i])].append(items_to_append[u])\n\n # print shapes:\n print('shape of X_train single area: ', reshaped_list_X[0].shape)\n print('shape of X_valid single area: ', reshaped_list_X[1].shape)\n print('shape of X_test single area: ', reshaped_list_X[2].shape)\n print('shape of y_train single area: ', reshaped_list_y[0].shape)\n print('shape of y_valid single area: ', reshaped_list_y[1].shape)\n print('shape of y_test single area: ', reshaped_list_y[2].shape)\n\n # concat results of all areas:\n key_list = list(area_supervised_dict.keys())\n\n if multivariate_flag == True:\n # create training set, valid & test set containing inputs of all selected areas: -> append all areas into one big np.array!\n\n # fill arrays with entries of first area:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][2], area_supervised_dict[key_list[0]][3]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n\n # concat remaining areas:\n for i in range(1, len(key_list)):\n X_train = np.concatenate(\n (X_train, area_supervised_dict[key_list[i]][0]), axis=2)\n X_valid = np.concatenate(\n (X_valid, area_supervised_dict[key_list[i]][2]), axis=2)\n X_test = np.concatenate(\n (X_test, area_supervised_dict[key_list[i]][4]), axis=2)\n y_train = np.concatenate(\n (y_train, area_supervised_dict[key_list[i]][1]), axis=1)\n y_valid = np.concatenate(\n (y_valid, area_supervised_dict[key_list[i]][3]), axis=1)\n y_test = np.concatenate(\n (y_test, area_supervised_dict[key_list[i]][5]), axis=1)\n\n else:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][2], area_supervised_dict[key_list[0]][3]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n\n print('final shape of X_train: ', X_train.shape)\n\n return X_train, y_train, X_valid, y_valid, X_test, y_test, scaler_list\n\n\n# # Model Definition\n\n# ## simpler autoencoder with either 1 hidden layer or 2 hidden layers\n\n#%%\n\ndef create_autoencoder_1H(X_train, X_valid, n_timesteps, n_preds, n_hidden1_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n input1 = Input(shape=(X_train.shape[1], X_train.shape[2]))\n encoder = LSTM(n_hidden1_units, activation='tanh',\n dropout=dropout_rate)(input1)\n # encoder layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector which returns the meaningful vector \"n\" times\n # define reconstruct decoder\n # needed to keep the 3D shape (X_train.shape[1])\n encoder2 = RepeatVector(n_timesteps)(encoder)\n decoder = LSTM(n_hidden1_units, activation='tanh', return_sequences=True,\n dropout=dropout_rate)(encoder2) # previously 32units\n decoder = 
TimeDistributed(Dense(n_preds))(decoder)\n\n autoencoder_model_ts = Model(inputs=input1, outputs=decoder)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit(X_train, X_train, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=(X_valid, X_valid), verbose=1, shuffle=True)\n return history, autoencoder_model_ts\n\n#%%\n\ndef create_stacked_autoencoder_2H(X_train, X_valid, n_timesteps, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n input1 = Input(shape=(X_train.shape[1], X_train.shape[2]))\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input1)\n encoder2 = LSTM(n_hidden2_units, activation='tanh',\n dropout=dropout_rate)(encoder1)\n # encoder2 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder3 = RepeatVector(n_timesteps)(encoder2)\n decoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(encoder3)\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=input1, outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit(X_train, X_train, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=(X_valid, X_valid), verbose=1, shuffle=True)\n return history, autoencoder_model_ts\n\n#%%\n\ndef create_prediction_model_2H(X_train_encoded, y_train, X_valid_encoded, y_valid, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs, shuffle_flag, early_stopping_flag):\n\n # create callback list for different callbacks:\n callback_list = []\n\n if early_stopping_flag == True:\n print('Early Stopping applied')\n # create Callback for EarlyStopping:\n early_stop = EarlyStopping(\n monitor='val_loss', mode='min', verbose=1, patience=20)\n # append callback to list:\n callback_list.append(early_stop)\n\n # define prediction model:\n input2 = Input(shape=(X_train_encoded.shape[1], X_train_encoded.shape[2]))\n lstm_2 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input2)\n lstm_2 = LSTM(n_hidden2_units, activation='tanh',\n return_sequences=False, dropout=dropout_rate)(lstm_2)\n #dense2 = Dense(50)(lstm_2)\n out2 = Dense(n_preds)(lstm_2)\n\n predict_model = Model(input2, out2)\n predict_model.compile(loss='mse', optimizer='adam', metrics=['mae'])\n\n history = predict_model.fit(X_train_encoded, y_train, epochs=n_epochs, batch_size=n_batch_size_training, validation_data=(\n X_valid_encoded, y_valid), verbose=1, shuffle=shuffle_flag, callbacks=callback_list)\n return history, predict_model\n\n\n# ## UBER autoencoder with either 1 hidden layer or 2 hidden layers\n\n#%%\n\n'''Note: There are different ways how the autoencoder can be constructed:\n 1) LSTM LAyers of Encoding Scheme don't use return_sequences -> this way a static embedding is returned after all timesteps are processed, -> a RepeatVector-Layer is needed which creates multiple copies of the static embedding\n 2) LSTM LAyers of Encoding Scheme use return_sequences -> The hidden state of each time step is returned -> we don't need RepeatVector-Layer\n \n --> here: repeatVector used in this Version\n'''\n\n\ndef 
create_UBER_autoencoder_1H_with_RepeatVect(X_train_T, X_train_F_prev, X_train_F_fut, X_valid_T, X_valid_F_prev, X_valid_F_fut, n_timesteps_F, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n # define input of regular sliding window sequence (previous 't' days)\n input1 = Input(shape=(X_train_T.shape[1], X_train_T.shape[2]))\n # create LSTM layers:\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=False, dropout=dropout_rate)(input1)\n # encoder1 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder2 = RepeatVector(n_timesteps_F)(encoder1)\n # define input of additional sliding window sequence (previous 'F' days of target) -> defined in UBER Paper..\n input2 = Input(shape=(X_train_F_prev.shape[1], X_train_F_prev.shape[2]))\n # create concat layer to add additional sequence input on axis=2:\n concat1 = Concatenate(axis=2)([encoder2, input2])\n # Decoding Module:\n decoder1 = LSTM(n_hidden2_units, activation='tanh', return_sequences=True,\n dropout=dropout_rate)(concat1) # decoder1 returns sequence of hidden states\n # Note: \"TimeDistributed\" applies Dense-Layer on each sequence returned by decoder1\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=[input1, input2], outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit([X_train_T, X_train_F_prev], X_train_F_fut, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=([X_valid_T, X_valid_F_prev], X_valid_F_fut), verbose=1, shuffle=True)\n\n return history, autoencoder_model_ts\n\n#%%\n\n''' --> here: repeatVector used in this Version '''\n\n\ndef create_stacked_UBER_autoencoder_2H(X_train_T, X_train_F_prev, X_train_F_fut, X_valid_T, X_valid_F_prev, X_valid_F_fut, n_timesteps_F, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n # define input of regular sliding window sequence (previous 't' days)\n input1 = Input(shape=(X_train_T.shape[1], X_train_T.shape[2]))\n # create LSTM layers:\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input1)\n encoder2 = LSTM(n_hidden2_units, activation='tanh',\n dropout=dropout_rate)(encoder1)\n # encoder2 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder3 = RepeatVector(n_timesteps_F)(encoder2)\n # define input of additional sliding window sequence (previous 'F' days of target) -> defined in UBER Paper..\n input2 = Input(shape=(X_train_F_prev.shape[1], X_train_F_prev.shape[2]))\n # create concat layer to add additional sequence input on axis=2:\n concat1 = Concatenate(axis=2)([encoder3, input2])\n # Decoding Module:\n decoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(concat1)\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=[input1, input2], outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = 
autoencoder_model_ts.fit([X_train_T, X_train_F_prev], X_train_F_fut, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=([X_valid_T, X_valid_F_prev], X_valid_F_fut), verbose=1, shuffle=True)\n\n return history, autoencoder_model_ts\n\n#%%\n\ndef create_UBER_prediction_model_3H(X_train_encoded, y_train, X_valid_encoded, y_valid, n_preds, n_hidden1_units, n_hidden2_units, n_hidden3_units, dropout_rate, n_batch_size_training, n_epochs, shuffle_flag):\n # define prediction model:\n # we only need 2D shape for Dense-Layers..\n input_data = Input(shape=(X_train_encoded.shape[1], ))\n dense1 = Dense(n_hidden1_units, activation='tanh')(input_data)\n dropout1 = Dropout(dropout_rate)(dense1)\n dense2 = Dense(n_hidden2_units, activation='tanh')(dropout1)\n dropout2 = Dropout(dropout_rate)(dense2)\n dense3 = Dense(n_hidden3_units, activation='tanh')(dropout2)\n dropout3 = Dropout(dropout_rate)(dense3)\n out2 = Dense(n_preds)(dropout3)\n\n # compile model:\n predict_model = Model(input_data, out2)\n predict_model.compile(loss='mse', optimizer='adam', metrics=['mae'])\n\n history = predict_model.fit(X_train_encoded, y_train, epochs=n_epochs, batch_size=n_batch_size_training,\n validation_data=(X_valid_encoded, y_valid), verbose=1, shuffle=shuffle_flag)\n\n return history, predict_model\n\n\n# # get predictions\n\n#%%\n\n# get prediction for scaled + differenced input_data (assuming scaled input) !!!!!!:\n\n# assuming encoding & reshaping for input_data was already done\n\ndef get_rescaled_decoded_predictions(model, multivariate_flag, X_test_encoded, year_to_access, scaler_list, standardizing_flag, scale_range, n_preds, original_complete_dataset, model_name):\n\n # 1) get predictions:\n print('get predictions of model...')\n yhat = model.predict(X_test_encoded, verbose=1)\n # print(yhat.shape)\n # print(yhat)\n\n print('yhat shape: ', yhat.shape)\n\n # 2) rescale predictions (use scaler of each area):\n if multivariate_flag == False:\n yhat_rescaled_all = invert_data_scaler(\n scaler_list[0], yhat, standardizing_flag, scale_range, n_preds)\n\n else:\n yhat_rescaled_all_list = []\n for i in range(len(scaler_list)):\n # slice predictions for each area and rescale predictions: (Note: slicing columns of numpy array returns a list! 
-> reshaping necessary afterwards)\n yhat_area_i = yhat[:, i]\n # reshape unscaled predictions for scaler:\n yhat_area_i = yhat_area_i.reshape(len(yhat_area_i), 1)\n # apply scaler:\n # Note: for multivariate case n_preds is set to \"1\" since we only take first column of scaler for each area\n yhat_rescaled_area_i = invert_data_scaler(\n scaler_list[i], yhat_area_i, standardizing_flag, scale_range, 1)\n yhat_rescaled_all_list.append(yhat_rescaled_area_i)\n\n # restore numpy_array based on yhat_rescaled_all_list: (this way we have all rescaled predictions for each area in one big numpy array)\n yhat_rescaled_all = yhat_rescaled_all_list[0]\n for i in range(1, len(yhat_rescaled_all_list)):\n yhat_rescaled_all = np.concatenate(\n (yhat_rescaled_all, yhat_rescaled_all_list[i]), axis=1)\n\n print('First 2 scaled predictions')\n print(yhat_rescaled_all[0:2])\n print('Shape of predictions:', yhat_rescaled_all.shape)\n\n # 3) compare predictions with actuals / invert differencing:\n if multivariate_flag == True:\n print('Invert Differencing of multivariate predictions...')\n # invert differencing: (adding value of previous timestep)\n predictions_all = invert_differencing(\n original_complete_dataset, yhat_rescaled_all, year_to_access)\n\n print('predictions preview:')\n print(predictions_all.head())\n\n # 4) get rmse for each timeseries\n rmse_per_ts = []\n for u in range(n_preds):\n rmse_single_ts = np.sqrt(mean_squared_error(\n original_complete_dataset.loc[year_to_access].iloc[:, u], predictions_all.iloc[:, u]))\n rmse_per_ts.append(rmse_single_ts)\n print('RMSE per TS {} for model: {}: {}'.format(\n u, model_name, rmse_per_ts[u]))\n\n # get average of all rmses\n total_rmse = np.mean(rmse_per_ts)\n print('Avg.RMSE for multivariate model: {}: {}'.format(\n model_name, total_rmse))\n\n else:\n # invert differencing: (adding value of previous timestep)\n print('Invert Differencing of predictions...')\n predictions_all = invert_differencing(\n original_complete_dataset, yhat_rescaled_all[:, 0], year_to_access)\n\n print('predictions preview:')\n print(predictions_all.head())\n\n # 4) get rmse:\n rmse = np.sqrt(mean_squared_error(\n original_complete_dataset[year_to_access], predictions_all))\n print('RMSE for model: {}: {}'.format(model_name, rmse))\n\n # return RMSE results:\n rmse_results = []\n if multivariate_flag == True:\n rmse_results.append(total_rmse)\n rmse_results.append(rmse_per_ts)\n\n else:\n rmse_results.append(rmse)\n\n return predictions_all, rmse_results\n\n\n# # functions to store predictions on disk:\n\n#%%\n\n# safe predictions and history into df: (assuming models are trained with MSE: training and validation set)\ndef preds_into_df(preds_df, original_complete_dataset, multivariate_flag, year_to_store):\n # add actuals to single df:\n if multivariate_flag == False:\n print('Add Actuals to df for single area...')\n # create dataframe to add column easily:\n preds_df = pd.DataFrame(preds_df)\n # store actual values to calculate RMSE quickly\n preds_df['actuals'] = original_complete_dataset[year_to_store]\n\n print('creation of preds_df done')\n\n return preds_df\n\n\ndef trainhistory_into_df(encoder_training_history, prediction_model_training_history, hist_col_labels):\n # create df for traning_history:\n hist_encoder_df = pd.DataFrame(\n encoder_training_history.history['loss'], columns=[hist_col_labels[0]])\n hist_encoder_df[hist_col_labels[1]\n ] = encoder_training_history.history['val_loss']\n\n hist_prediction_df = pd.DataFrame(\n 
prediction_model_training_history.history['loss'], columns=[hist_col_labels[0]])\n hist_prediction_df[hist_col_labels[1]\n ] = prediction_model_training_history.history['val_loss']\n\n print('creation of history_dfs done')\n\n return hist_encoder_df, hist_prediction_df\n\n\n# store predictions and training_history on disk:\ndef store_preds_and_trainhistory_on_disk(preds_df, hist_encoder_df, hist_prediction_df, preds_df_filename, hist_encoder_df_filename, hist_prediction_df_filename, Store_PATH):\n # get path where to store df:\n preds_df_final_path = os.path.join(Store_PATH, preds_df_filename)\n # store df:\n preds_df.to_csv(preds_df_final_path, header=True)\n\n # store history:\n hist_encoder_df_final_path = os.path.join(\n Store_PATH, hist_encoder_df_filename)\n hist_encoder_df.to_csv(hist_encoder_df_final_path, header=True)\n\n hist_prediction_df_final_path = os.path.join(\n Store_PATH, hist_prediction_df_filename)\n hist_prediction_df.to_csv(hist_prediction_df_final_path, header=True)\n\n print('Save df on disk done')\n\n#%%\n\n# save models to JSON -> check that weights_file name uses \".h5\" format & model_file_name \".json\"\ndef save_models_to_json(model_file_name, model_weights_file_name, Store_PATH, model):\n\n # create paths for model architecture & weights:\n model_final_path = Store_PATH + model_file_name\n weights_final_path = Store_PATH + model_weights_file_name\n\n # store model & weights:\n model_as_json = model.to_json()\n with open(model_final_path, \"w\") as json_file:\n json_file.write(model_as_json)\n # serialize weights to HDF5\n model.save_weights(weights_final_path)\n print(\"Saved model to disk\")\n\n#%%\n\n# store history + prediction_results in dicts diretly on disk with the help of other functions:\ndef store_results_of_dicts_on_disk(dict_to_access, validation_set_year, test_set_year, Df_Store_PATH, Model_Save_PATH, RMSE_Store_PATH, RMSE_df_name):\n\n # create & store dfs of Results:\n for key in dict_to_access:\n print('Store results of key: ', key)\n\n # store test_results:\n if 'multivariate' not in key:\n multivariate_flag = False\n\n # get prediction_df:\n preds_df = preds_into_df(\n dict_to_access[key][0][2], dict_to_access[key][0][4], multivariate_flag, test_set_year)\n\n # create history dfs:\n hist_col_labels = [\n 'loss (mse)', 'mae', 'val_loss (mse)', 'val_mae']\n hist_encoder_df, hist_prediction_df = trainhistory_into_df(\n dict_to_access[key][0][8], dict_to_access[key][0][0], hist_col_labels)\n\n # create validation_df:\n valid_preds_df = dict_to_access[key][0][5]\n valid_preds_df = pd.DataFrame(valid_preds_df)\n # store actual values of original dataset\n valid_preds_df['actuals'] = dict_to_access[key][0][4].loc[validation_set_year]\n\n else:\n multivariate_flag = True\n # get prediction_df:\n preds_df = preds_into_df(\n dict_to_access[key][0][2], dict_to_access[key][0][4], multivariate_flag, test_set_year)\n\n # create history dfs:\n hist_col_labels = [\n 'loss (mse)', 'mae', 'val_loss (mse)', 'val_mae']\n hist_encoder_df, hist_prediction_df = trainhistory_into_df(\n dict_to_access[key][0][8], dict_to_access[key][0][0], hist_col_labels)\n\n # create validation_df for multivariate case:\n valid_preds_df = dict_to_access[key][0][5]\n\n # create filenames:\n results_file = 'results_' + dict_to_access[key][0][3] + '.csv'\n history_encoder_file = 'history_autoencoder_' + \\\n dict_to_access[key][0][3] + '.csv'\n history_predict_file = 'history_predict_' + \\\n dict_to_access[key][0][3] + '.csv'\n valid_results_file = 'validation_results_' + \\\n 
dict_to_access[key][0][3] + '.csv'\n\n # store test results:\n store_preds_and_trainhistory_on_disk(preds_df, hist_encoder_df, hist_prediction_df,\n results_file, history_encoder_file, history_predict_file, Df_Store_PATH)\n # store validation results:\n # get path where to store df:\n valid_df_final_path = os.path.join(Df_Store_PATH, valid_results_file)\n # store df:\n valid_preds_df.to_csv(valid_df_final_path, header=True)\n\n # store model architecture (architecture + weights):\n autoencoder_model_architecture_file = 'autoencoder_model_' + \\\n dict_to_access[key][0][3] + '.json'\n autoencoder_model_weights_file = 'autoencoder_model_' + \\\n dict_to_access[key][0][3] + '_weights.h5'\n predict_model_architecture_file = 'predict_model_' + \\\n dict_to_access[key][0][3] + '.json'\n predict_model_weights_file = 'predict_model_' + \\\n dict_to_access[key][0][3] + '_weights.h5'\n\n # call function to save models:\n # save autoencoder model:\n save_models_to_json(autoencoder_model_architecture_file,\n autoencoder_model_weights_file, Model_Save_PATH, dict_to_access[key][0][9])\n # save predict model:\n save_models_to_json(predict_model_architecture_file,\n predict_model_weights_file, Model_Save_PATH, dict_to_access[key][0][1])\n\n # store RMSE results of models:\n # create dict to store RMSE results:\n dict_test_rmse = {}\n dict_valid_rmse = {}\n\n # add values to dict:\n # Note: for multivar_models only avg. RMSE is stored!!\n if 'multivar' in key:\n dict_valid_rmse[key] = []\n dict_test_rmse[key] = []\n # append avg. RMSE results of multivar model:\n dict_valid_rmse[key].append(dict_to_access[key][0][7][0])\n dict_test_rmse[key].append(dict_to_access[key][0][6][0])\n\n else:\n dict_test_rmse[key] = []\n dict_valid_rmse[key] = []\n dict_valid_rmse[key].append(dict_to_access[key][0][7])\n dict_test_rmse[key].append(dict_to_access[key][0][6])\n\n # create Df from rmse_dicts:\n rmse_valid_df = pd.DataFrame.from_dict(dict_valid_rmse, orient='index')\n rmse_test_df = pd.DataFrame.from_dict(dict_test_rmse, orient='index')", "target_code": " all_rmse_df = pd.concat([rmse_valid_df, rmse_test_df], axis=1, names=[\n 'RMSE_Validation', 'RMSE_Test'])\n all_rmse_df.columns = ['RMSE_Validation', 'RMSE_Test']\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# In this notebook different autoencoder architectures are implemented: Each architecture is based on an encoding-module and a prediction module. The window size refers to how many timesteps are used to make a prediction. 
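#%%

# Sketch of how the two RMSE frames built just above can be combined into one
# overview table, in line with the target completion recorded for this entry.
# Writing the result out via RMSE_Store_PATH / RMSE_df_name (function arguments
# that are otherwise unused in the visible body) is an assumption, and inside
# store_results_of_dicts_on_disk these lines would sit indented in the body.

all_rmse_df = pd.concat([rmse_valid_df, rmse_test_df], axis=1)
all_rmse_df.columns = ['RMSE_Validation', 'RMSE_Test']
all_rmse_df.to_csv(os.path.join(RMSE_Store_PATH, RMSE_df_name), header=True)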
\"Multivariate\" models process input data of multiple areas at the same time to predict demand.\n\n# # Import Modules\n\n\nimport tensorflow as tf\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\nimport matplotlib.dates as mdates\n\nimport datetime\n\nimport sklearn.preprocessing\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.preprocessing import StandardScaler\n\nimport itertools\n\nimport os\n\n#from tqdm import tqdm\n\nfrom keras.models import Model\nfrom keras.models import Sequential\nfrom keras.layers import LSTM\nfrom keras.layers import Dense\nfrom keras.layers import Input\nfrom keras.layers import TimeDistributed\nfrom keras.layers import RepeatVector\nfrom keras.layers import Concatenate\nfrom keras.layers import Dropout\n\nfrom keras import optimizers\nfrom keras.models import model_from_json\nfrom keras import regularizers\n\nfrom keras.callbacks import EarlyStopping\n\nimport save_files_collection as sf\n\n\nfrom keras import backend as K\nK.tensorflow_backend._get_available_gpus()\n\n\nsess = tf.Session(config=tf.ConfigProto(log_device_placement=True))\n\n\n# load data\nTRAIN_PATH = '/media/...'\nStore_PATH = '/media/...'\nfile_name = 'ts_10largest.csv'\nts_10largest = pd.read_csv(TRAIN_PATH + file_name,\n header=0, parse_dates=['date'], index_col='date')\n\n\n# # Data Preparation\n\n\n''' Time Series Preparation '''\n\n# Preprocessing Steps: 1) Differencing 2) Scaling 3) Supervised_Sequence\n\n# make timeseries stationary\n\n\ndef differencing(dataset, shift_interval):\n shifted = dataset.shift(shift_interval).copy(deep=True)\n diff_series = dataset - shifted\n diff_series.dropna(inplace=True)\n return diff_series\n\n\ndef invert_differencing(history, predictions, year_to_access):\n history_copy = history.copy()\n history_shift = history_copy.shift(1)\n history_shift.dropna(inplace=True)\n preds_all = predictions + history_shift.loc[year_to_access]\n\n return preds_all\n\n\n# it is recommended to scale input data in the range of the activation function used by the network. 
Since we use \"tanh\"..\n# scale datasets based on fit of train_set; input set has to be a reshaped numpy_array!\ndef data_scaler(train_data, valid_data, test_data, scale_range, standardizing_flag=False):\n\n if standardizing_flag == False:\n print('MinMax-Scaling used...')\n # use MinMax-Scaling based on scale_range given...\n # feature_range=(-1, 1)\n scaler = MinMaxScaler(feature_range=scale_range)\n scaler = scaler.fit(train_data)\n # scale train_set:\n train_scaled = scaler.transform(train_data)\n train_scaled_df = pd.DataFrame(train_scaled)\n # scale valididation data based on scaler fitted on training data:\n valid_scaled = scaler.transform(valid_data)\n valid_scaled_df = pd.DataFrame(valid_scaled)\n # scale test_set based on scaler fitted on training data:\n test_scaled = scaler.transform(test_data)\n test_scaled_df = pd.DataFrame(test_scaled)\n\n else:\n print('Standardizing used...')\n\n scaler = StandardScaler()\n scaler = scaler.fit(train_data)\n # scale train_set:\n train_scaled = scaler.transform(train_data)\n train_scaled_df = pd.DataFrame(train_scaled)\n # scale valididation data based on scaler fitted on training data:\n valid_scaled = scaler.transform(valid_data)\n valid_scaled_df = pd.DataFrame(valid_scaled)\n # scale test_set based on scaler fitted on training data:\n test_scaled = scaler.transform(test_data)\n test_scaled_df = pd.DataFrame(test_scaled)\n\n # return data as df, since \"supervised\" function requires df or series as input\n return scaler, train_scaled_df, valid_scaled_df, test_scaled_df\n\n\n# function to invert scaling:\ndef invert_data_scaler(scaler, predictions_data, standardizing_flag, scale_range, n_preds):\n\n if standardizing_flag == False:\n # create new scaler: -> necessary since old scaler might be fitted on lagged values -> expects more dimensions as the predictions have!\n new_scaler = MinMaxScaler(feature_range=scale_range)\n # copy attributes of old scaler to new one: but only for the first columns for which we have real predictions\n new_scaler.min_, new_scaler.scale_ = scaler.min_[\n :n_preds], scaler.scale_[:n_preds]\n\n preds = new_scaler.inverse_transform(predictions_data)\n\n else:\n new_scaler = StandardScaler()\n # copy attributes of old scaler to new one: but only for the first columns for which we have real predictions\n new_scaler.mean_, new_scaler.scale_ = scaler.mean_[\n :n_preds], scaler.scale_[:n_preds]\n\n preds = new_scaler.inverse_transform(predictions_data)\n\n return preds\n\n\ndef create_supervised_data_single_ts(single_ts, n_timesteps):\n # copy dfs to ensure no changes are made on original dataset:\n sequence_copy = single_ts.copy(deep=True)\n # create dfs to easily append new columns and access columns:\n sequence_copy = pd.DataFrame(sequence_copy)\n sequence_df = pd.DataFrame(sequence_copy)\n\n # Note: range starts reversed to make sure the lags are in correct order\n for i in range(n_timesteps, 0, -1):\n sequence_df['lag_{}'.format(i)] = sequence_copy.iloc[:, 0].shift(i)\n\n # drop rows with NaNs -> if no lagged features are available, we drop the whole row\n sequence_df.dropna(inplace=True)\n\n return sequence_df\n\n\ndef create_supervised_UBER_data_single_ts(single_ts, n_timesteps_T, n_timesteps_F):\n # copy dfs to ensure no changes are made on original dataset:\n sequence_copy = single_ts.copy(deep=True)\n # create dfs to easily append new columns and access columns:\n sequence_copy = pd.DataFrame(sequence_copy)\n sequence_df = pd.DataFrame(sequence_copy)\n\n # Note: range starts reversed to make sure the lags 
are in correct order\n for i in range(n_timesteps_T, 0, -1):\n sequence_df['lag_T{}'.format(i)] = sequence_copy.iloc[:, 0].shift(i)\n for i in range(n_timesteps_F, 0, -1):\n sequence_df['prev_lag_F{}'.format(\n i)] = sequence_copy.iloc[:, 0].shift(i)\n # Note: for future values range starts \"normal\":\n for i in range(1, n_timesteps_F+1):\n sequence_df['fut_{}'.format(i)] = sequence_copy.iloc[:, 0].shift(-i)\n\n # drop rows with NaNs -> if no lagged features are available, we drop the whole row\n sequence_df.dropna(inplace=True)\n\n return sequence_df\n\n\n# # preprocess data for models:\n\n\ndef generate_UBER_data_autoencoder(ts_series, multivariate_flag, last_train_set_year,\n validation_set_year, test_set_year, n_timesteps_T, n_timesteps_F, n_preds,\n scale_range, standardizing_flag):\n # prepare data:\n\n # 1) apply differencing\n ts_diff = differencing(ts_series, 1)\n\n # reassign df:\n ts_series = ts_diff\n\n # 2) get supervised data:\n if multivariate_flag == False:\n # change type of ts_series:\n # -> this way we can access the \"column\" parameter\n ts_series = pd.DataFrame(ts_series)\n\n # prepare dict to store results of each area:\n area_labels = list(ts_series.columns)\n area_supervised_dict = {}\n scaler_list = []\n\n # get data for each area:\n for i in range(len(area_labels)):\n print('data for area{} is prepared...'.format(area_labels[i]))\n # create key:\n area_supervised_dict['area{}'.format(area_labels[i])] = []\n # get supervised data:\n supervised_data_df = create_supervised_UBER_data_single_ts(\n ts_series.iloc[:, i], n_timesteps_T, n_timesteps_F)\n\n # get train/test split:\n ts_train_all, ts_test = supervised_data_df.loc[:\n validation_set_year], supervised_data_df.loc[test_set_year]\n # get train/validation Split:\n ts_train, ts_valid = ts_train_all.loc[:\n last_train_set_year], ts_train_all.loc[validation_set_year]\n\n # scale data:\n # create numpy arrays for scaler:\n ts_train_array = ts_train.values\n ts_valid_array = ts_valid.values\n ts_test_array = ts_test.values\n\n print('Data is scaled...')\n scaler, train_scaled, valid_scaled, test_scaled = data_scaler(\n ts_train_array, ts_valid_array, ts_test_array, scale_range, standardizing_flag)\n\n # store scaler:\n scaler_list.append(scaler)\n\n # restore index:\n train_scaled.index = ts_train.index\n valid_scaled.index = ts_valid.index\n test_scaled.index = ts_test.index\n\n # slice lags_T, lags_F, future_F values:\n # slice lags_T which are actually X:\n X_train = train_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_valid = valid_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_test = test_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n # get y values:\n y_train = train_scaled.iloc[:, 0].values\n y_valid = valid_scaled.iloc[:, 0].values\n y_test = test_scaled.iloc[:, 0].values\n # slice lags_F:\n lags_F_train = train_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n lags_F_valid = valid_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n lags_F_test = test_scaled.iloc[:, -\n (n_timesteps_F + n_timesteps_F):-(n_timesteps_F)].values\n # slice future_F:\n fut_F_train = train_scaled.iloc[:, -(n_timesteps_F):].values\n fut_F_valid = valid_scaled.iloc[:, -(n_timesteps_F):].values\n fut_F_test = test_scaled.iloc[:, -(n_timesteps_F):].values\n\n # reshape X, y and lags_F and fut_F data to easily append other areas later on:\n list_to_reshape_X_lags_F_fut_F = [X_train, X_valid, X_test, lags_F_train,\n lags_F_valid, lags_F_test, fut_F_train, fut_F_valid, fut_F_test]\n 
list_to_reshape_y = [y_train, y_valid, y_test]\n reshaped_list_X_lags_F_fut_F = []\n reshaped_list_y = []\n # reshape X:\n for array in list_to_reshape_X_lags_F_fut_F:\n # shape: (#n_samples,n_lags,n_areas)\n array = array.reshape((array.shape[0], array.shape[1], 1))\n reshaped_list_X_lags_F_fut_F.append(array)\n\n # reshape y:\n for array in list_to_reshape_y:\n # shape: (#n_samples,n_areas)\n array = array.reshape((array.shape[0], 1))\n reshaped_list_y.append(array)\n\n # append results to dict:\n items_to_append = [reshaped_list_X_lags_F_fut_F[0], reshaped_list_y[0], reshaped_list_X_lags_F_fut_F[3], reshaped_list_X_lags_F_fut_F[6],\n reshaped_list_X_lags_F_fut_F[1], reshaped_list_y[\n 1], reshaped_list_X_lags_F_fut_F[4], reshaped_list_X_lags_F_fut_F[7],\n reshaped_list_X_lags_F_fut_F[2], reshaped_list_y[2], reshaped_list_X_lags_F_fut_F[5], reshaped_list_X_lags_F_fut_F[8]]\n\n for u in range(len(items_to_append)):\n area_supervised_dict['area{}'.format(\n area_labels[i])].append(items_to_append[u])\n\n # print shapes:\n print('shape of X_train single area: ',\n reshaped_list_X_lags_F_fut_F[0].shape)\n print('shape of X_valid single area: ',\n reshaped_list_X_lags_F_fut_F[1].shape)\n print('shape of X_test single area: ',\n reshaped_list_X_lags_F_fut_F[2].shape)\n print('shape of y_train single area: ', reshaped_list_y[0].shape)\n print('shape of y_valid single area: ', reshaped_list_y[1].shape)\n print('shape of y_test single area: ', reshaped_list_y[2].shape)\n print('shape of prev_lags_F_train single area: ',\n reshaped_list_X_lags_F_fut_F[3].shape)\n print('shape of fut_F_values_train single area: ',\n reshaped_list_X_lags_F_fut_F[6].shape)\n\n # concat results of all areas:\n key_list = list(area_supervised_dict.keys())\n\n if multivariate_flag == True:\n # create training set, valid & test set containing inputs of all selected areas: -> append all areas into one big np.array!\n\n # fill arrays with entries of first area:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][8], area_supervised_dict[key_list[0]][9]\n\n lags_F_train = area_supervised_dict[key_list[0]][2]\n lags_F_valid = area_supervised_dict[key_list[0]][6]\n lags_F_test = area_supervised_dict[key_list[0]][10]\n\n fut_F_train = area_supervised_dict[key_list[0]][3]\n fut_F_valid = area_supervised_dict[key_list[0]][7]\n fut_F_test = area_supervised_dict[key_list[0]][11]\n\n # concat remaining areas:\n for i in range(1, len(key_list)):\n X_train = np.concatenate(\n (X_train, area_supervised_dict[key_list[i]][0]), axis=2)\n X_valid = np.concatenate(\n (X_valid, area_supervised_dict[key_list[i]][4]), axis=2)\n X_test = np.concatenate(\n (X_test, area_supervised_dict[key_list[i]][8]), axis=2)\n y_train = np.concatenate(\n (y_train, area_supervised_dict[key_list[i]][1]), axis=1)\n y_valid = np.concatenate(\n (y_valid, area_supervised_dict[key_list[i]][5]), axis=1)\n y_test = np.concatenate(\n (y_test, area_supervised_dict[key_list[i]][9]), axis=1)\n\n lags_F_train = np.concatenate(\n (lags_F_train, area_supervised_dict[key_list[i]][2]), axis=2)\n lags_F_valid = np.concatenate(\n (lags_F_valid, area_supervised_dict[key_list[i]][6]), axis=2)\n lags_F_test = np.concatenate(\n (lags_F_test, area_supervised_dict[key_list[i]][10]), axis=2)\n\n fut_F_train = np.concatenate(\n (fut_F_train, area_supervised_dict[key_list[i]][3]), 
axis=2)\n fut_F_valid = np.concatenate(\n (fut_F_valid, area_supervised_dict[key_list[i]][7]), axis=2)\n fut_F_test = np.concatenate(\n (fut_F_test, area_supervised_dict[key_list[i]][11]), axis=2)\n\n else:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][8], area_supervised_dict[key_list[0]][9]\n\n lags_F_train = area_supervised_dict[key_list[0]][2]\n lags_F_valid = area_supervised_dict[key_list[0]][6]\n lags_F_test = area_supervised_dict[key_list[0]][10]\n\n fut_F_train = area_supervised_dict[key_list[0]][3]\n fut_F_valid = area_supervised_dict[key_list[0]][7]\n fut_F_test = area_supervised_dict[key_list[0]][11]\n\n print('final shape of X_train: ', X_train.shape)\n\n # return area_supervised_dict\n\n # Note: We only need to return lags_F & fut_F_values for train and valid set since autoencoder is only \"trained\" on these sets..\n return (X_train, y_train, X_valid, y_valid, X_test, y_test, lags_F_train, lags_F_valid, fut_F_train,\n fut_F_valid, scaler_list)\n\n\ndef generate_data_autoencoder(ts_series, multivariate_flag, last_train_set_year, validation_set_year, test_set_year,\n n_timesteps, n_preds, scale_range, standardizing_flag):\n # prepare data:\n\n # 1) apply differencing\n ts_diff = differencing(ts_series, 1)\n\n # reassign df:\n ts_series = ts_diff\n\n # 2) get supervised data:\n if multivariate_flag == False:\n # change type of ts_series:\n # -> this way we can access the \"column\" parameter\n ts_series = pd.DataFrame(ts_series)\n\n # prepare dict to store results of each area:\n area_labels = list(ts_series.columns)\n area_supervised_dict = {}\n scaler_list = []\n\n # get data for each area:\n for i in range(len(area_labels)):\n print('data for area{} is prepared...'.format(area_labels[i]))\n # create key:\n area_supervised_dict['area{}'.format(area_labels[i])] = []\n # get supervised data:\n supervised_data_df = create_supervised_data_single_ts(\n ts_series.iloc[:, i], n_timesteps)\n\n # get train/test split:\n ts_train_all, ts_test = supervised_data_df.loc[:\n validation_set_year], supervised_data_df.loc[test_set_year]\n # get train/validation Split:\n ts_train, ts_valid = ts_train_all.loc[:\n last_train_set_year], ts_train_all.loc[validation_set_year]\n\n # scale data:\n # create numpy arrays for scaler:\n ts_train_array = ts_train.values\n ts_valid_array = ts_valid.values\n ts_test_array = ts_test.values\n\n print('Data is scaled...')\n scaler, train_scaled, valid_scaled, test_scaled = data_scaler(\n ts_train_array, ts_valid_array, ts_test_array, scale_range, standardizing_flag)\n\n # store scaler:\n scaler_list.append(scaler)\n\n # restore index:\n train_scaled.index = ts_train.index\n valid_scaled.index = ts_valid.index\n test_scaled.index = ts_test.index\n\n # get X, y pairs:\n X_train = train_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_valid = valid_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n X_test = test_scaled.iloc[:, 1:(n_timesteps_T+1)].values\n # get y values:\n y_train = train_scaled.iloc[:, 0].values\n y_valid = valid_scaled.iloc[:, 0].values\n y_test = test_scaled.iloc[:, 0].values\n\n # reshape X, y to easily append other areas later on:\n list_to_reshape_X = [X_train, X_valid, X_test]\n list_to_reshape_y = [y_train, y_valid, y_test]\n reshaped_list_X = []\n reshaped_list_y = []\n # reshape X:\n for array in list_to_reshape_X:\n # shape: 
(#n_samples,n_lags,n_areas)\n array = array.reshape((array.shape[0], array.shape[1], 1))\n reshaped_list_X.append(array)\n\n # reshape y:\n for array in list_to_reshape_y:\n # shape: (#n_samples,n_areas)\n array = array.reshape((array.shape[0], 1))\n reshaped_list_y.append(array)\n\n # append results to dict:\n items_to_append = [reshaped_list_X[0], reshaped_list_y[0],\n reshaped_list_X[1], reshaped_list_y[1],\n reshaped_list_X[2], reshaped_list_y[2]]\n\n for u in range(len(items_to_append)):\n area_supervised_dict['area{}'.format(\n area_labels[i])].append(items_to_append[u])\n\n # print shapes:\n print('shape of X_train single area: ', reshaped_list_X[0].shape)\n print('shape of X_valid single area: ', reshaped_list_X[1].shape)\n print('shape of X_test single area: ', reshaped_list_X[2].shape)\n print('shape of y_train single area: ', reshaped_list_y[0].shape)\n print('shape of y_valid single area: ', reshaped_list_y[1].shape)\n print('shape of y_test single area: ', reshaped_list_y[2].shape)\n\n # concat results of all areas:\n key_list = list(area_supervised_dict.keys())\n\n if multivariate_flag == True:\n # create training set, valid & test set containing inputs of all selected areas: -> append all areas into one big np.array!\n\n # fill arrays with entries of first area:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][2], area_supervised_dict[key_list[0]][3]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n\n # concat remaining areas:\n for i in range(1, len(key_list)):\n X_train = np.concatenate(\n (X_train, area_supervised_dict[key_list[i]][0]), axis=2)\n X_valid = np.concatenate(\n (X_valid, area_supervised_dict[key_list[i]][2]), axis=2)\n X_test = np.concatenate(\n (X_test, area_supervised_dict[key_list[i]][4]), axis=2)\n y_train = np.concatenate(\n (y_train, area_supervised_dict[key_list[i]][1]), axis=1)\n y_valid = np.concatenate(\n (y_valid, area_supervised_dict[key_list[i]][3]), axis=1)\n y_test = np.concatenate(\n (y_test, area_supervised_dict[key_list[i]][5]), axis=1)\n\n else:\n X_train, y_train = area_supervised_dict[key_list[0]\n ][0], area_supervised_dict[key_list[0]][1]\n X_valid, y_valid = area_supervised_dict[key_list[0]\n ][2], area_supervised_dict[key_list[0]][3]\n X_test, y_test = area_supervised_dict[key_list[0]\n ][4], area_supervised_dict[key_list[0]][5]\n\n print('final shape of X_train: ', X_train.shape)\n\n return X_train, y_train, X_valid, y_valid, X_test, y_test, scaler_list\n\n\n# # Model Definition\n\n# ## simpler autoencoder with either 1 hidden layer or 2 hidden layers\n\n\ndef create_autoencoder_1H(X_train, X_valid, n_timesteps, n_preds, n_hidden1_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n input1 = Input(shape=(X_train.shape[1], X_train.shape[2]))\n encoder = LSTM(n_hidden1_units, activation='tanh',\n dropout=dropout_rate)(input1)\n # encoder layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector which returns the meaningful vector \"n\" times\n # define reconstruct decoder\n # needed to keep the 3D shape (X_train.shape[1])\n encoder2 = RepeatVector(n_timesteps)(encoder)\n decoder = LSTM(n_hidden1_units, activation='tanh', return_sequences=True,\n dropout=dropout_rate)(encoder2) # previously 32units\n decoder = TimeDistributed(Dense(n_preds))(decoder)\n\n autoencoder_model_ts = 
Model(inputs=input1, outputs=decoder)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit(X_train, X_train, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=(X_valid, X_valid), verbose=1, shuffle=True)\n return history, autoencoder_model_ts\n\n\ndef create_stacked_autoencoder_2H(X_train, X_valid, n_timesteps, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n input1 = Input(shape=(X_train.shape[1], X_train.shape[2]))\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input1)\n encoder2 = LSTM(n_hidden2_units, activation='tanh',\n dropout=dropout_rate)(encoder1)\n # encoder2 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder3 = RepeatVector(n_timesteps)(encoder2)\n decoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(encoder3)\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=input1, outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit(X_train, X_train, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=(X_valid, X_valid), verbose=1, shuffle=True)\n return history, autoencoder_model_ts\n\n\ndef create_prediction_model_2H(X_train_encoded, y_train, X_valid_encoded, y_valid, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs, shuffle_flag, early_stopping_flag):\n\n # create callback list for different callbacks:\n callback_list = []\n\n if early_stopping_flag == True:\n print('Early Stopping applied')\n # create Callback for EarlyStopping:\n early_stop = EarlyStopping(\n monitor='val_loss', mode='min', verbose=1, patience=20)\n # append callback to list:\n callback_list.append(early_stop)\n\n # define prediction model:\n input2 = Input(shape=(X_train_encoded.shape[1], X_train_encoded.shape[2]))\n lstm_2 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input2)\n lstm_2 = LSTM(n_hidden2_units, activation='tanh',\n return_sequences=False, dropout=dropout_rate)(lstm_2)\n #dense2 = Dense(50)(lstm_2)\n out2 = Dense(n_preds)(lstm_2)\n\n predict_model = Model(input2, out2)\n predict_model.compile(loss='mse', optimizer='adam', metrics=['mae'])\n\n history = predict_model.fit(X_train_encoded, y_train, epochs=n_epochs, batch_size=n_batch_size_training, validation_data=(\n X_valid_encoded, y_valid), verbose=1, shuffle=shuffle_flag, callbacks=callback_list)\n return history, predict_model\n\n\n# ## UBER autoencoder with either 1 hidden layer or 2 hidden layers\n\n\n'''Note: There are different ways how the autoencoder can be constructed:\n 1) LSTM LAyers of Encoding Scheme don't use return_sequences -> this way a static embedding is returned after all timesteps are processed, -> a RepeatVector-Layer is needed which creates multiple copies of the static embedding\n 2) LSTM LAyers of Encoding Scheme use return_sequences -> The hidden state of each time step is returned -> we don't need RepeatVector-Layer\n \n --> here: repeatVector used in this Version\n'''\n\n\ndef create_UBER_autoencoder_1H_with_RepeatVect(X_train_T, X_train_F_prev, X_train_F_fut, 
X_valid_T, X_valid_F_prev, X_valid_F_fut, n_timesteps_F, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n # define input of regular sliding window sequence (previous 't' days)\n input1 = Input(shape=(X_train_T.shape[1], X_train_T.shape[2]))\n # create LSTM layers:\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=False, dropout=dropout_rate)(input1)\n # encoder1 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder2 = RepeatVector(n_timesteps_F)(encoder1)\n # define input of additional sliding window sequence (previous 'F' days of target) -> defined in UBER Paper..\n input2 = Input(shape=(X_train_F_prev.shape[1], X_train_F_prev.shape[2]))\n # create concat layer to add additional sequence input on axis=2:\n concat1 = Concatenate(axis=2)([encoder2, input2])\n # Decoding Module:\n decoder1 = LSTM(n_hidden2_units, activation='tanh', return_sequences=True,\n dropout=dropout_rate)(concat1) # decoder1 returns sequence of hidden states\n # Note: \"TimeDistributed\" applies Dense-Layer on each sequence returned by decoder1\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=[input1, input2], outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit([X_train_T, X_train_F_prev], X_train_F_fut, batch_size=n_batch_size_training,\n epochs=n_epochs, validation_data=([X_valid_T, X_valid_F_prev], X_valid_F_fut), verbose=1, shuffle=True)\n\n return history, autoencoder_model_ts\n\n\n''' --> here: repeatVector used in this Version '''\n\n\ndef create_stacked_UBER_autoencoder_2H(X_train_T, X_train_F_prev, X_train_F_fut, X_valid_T, X_valid_F_prev, X_valid_F_fut, n_timesteps_F, n_preds, n_hidden1_units, n_hidden2_units, dropout_rate, n_batch_size_training, n_epochs):\n # create encoder with Keras Functional API:\n # define encoder\n # define input of regular sliding window sequence (previous 't' days)\n input1 = Input(shape=(X_train_T.shape[1], X_train_T.shape[2]))\n # create LSTM layers:\n encoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(input1)\n encoder2 = LSTM(n_hidden2_units, activation='tanh',\n dropout=dropout_rate)(encoder1)\n # encoder2 layer returns a static meaningful vector -> to receive a sequence again, we use RepeatVector..\n # define reconstruct decoder\n # RepeatVector actually belongs to the encoding-module -> name = \"encoder3\"\n encoder3 = RepeatVector(n_timesteps_F)(encoder2)\n # define input of additional sliding window sequence (previous 'F' days of target) -> defined in UBER Paper..\n input2 = Input(shape=(X_train_F_prev.shape[1], X_train_F_prev.shape[2]))\n # create concat layer to add additional sequence input on axis=2:\n concat1 = Concatenate(axis=2)([encoder3, input2])\n # Decoding Module:\n decoder1 = LSTM(n_hidden1_units, activation='tanh',\n return_sequences=True, dropout=dropout_rate)(concat1)\n decoder2 = TimeDistributed(Dense(n_preds))(decoder1)\n\n autoencoder_model_ts = Model(inputs=[input1, input2], outputs=decoder2)\n autoencoder_model_ts.compile(optimizer='adam', loss='mse')\n # fit model\n history = autoencoder_model_ts.fit([X_train_T, X_train_F_prev], X_train_F_fut, batch_size=n_batch_size_training,\n epochs=n_epochs, 
validation_data=([X_valid_T, X_valid_F_prev], X_valid_F_fut), verbose=1, shuffle=True)\n\n return history, autoencoder_model_ts\n\n\ndef create_UBER_prediction_model_3H(X_train_encoded, y_train, X_valid_encoded, y_valid, n_preds, n_hidden1_units, n_hidden2_units, n_hidden3_units, dropout_rate, n_batch_size_training, n_epochs, shuffle_flag):\n # define prediction model:\n # we only need 2D shape for Dense-Layers..\n input_data = Input(shape=(X_train_encoded.shape[1], ))\n dense1 = Dense(n_hidden1_units, activation='tanh')(input_data)\n dropout1 = Dropout(dropout_rate)(dense1)\n dense2 = Dense(n_hidden2_units, activation='tanh')(dropout1)\n dropout2 = Dropout(dropout_rate)(dense2)\n dense3 = Dense(n_hidden3_units, activation='tanh')(dropout2)\n dropout3 = Dropout(dropout_rate)(dense3)\n out2 = Dense(n_preds)(dropout3)\n\n # compile model:\n predict_model = Model(input_data, out2)\n predict_model.compile(loss='mse', optimizer='adam', metrics=['mae'])\n\n history = predict_model.fit(X_train_encoded, y_train, epochs=n_epochs, batch_size=n_batch_size_training,\n validation_data=(X_valid_encoded, y_valid), verbose=1, shuffle=shuffle_flag)\n\n return history, predict_model\n\n\n# # get predictions\n\n\n# get prediction for scaled + differenced input_data (assuming scaled input) !!!!!!:\n\n# assuming encoding & reshaping for input_data was already done\n\ndef get_rescaled_decoded_predictions(model, multivariate_flag, X_test_encoded, year_to_access, scaler_list, standardizing_flag, scale_range, n_preds, original_complete_dataset, model_name):\n\n # 1) get predictions:\n print('get predictions of model...')\n yhat = model.predict(X_test_encoded, verbose=1)\n # print(yhat.shape)\n # print(yhat)\n\n print('yhat shape: ', yhat.shape)\n\n # 2) rescale predictions (use scaler of each area):\n if multivariate_flag == False:\n yhat_rescaled_all = invert_data_scaler(\n scaler_list[0], yhat, standardizing_flag, scale_range, n_preds)\n\n else:\n yhat_rescaled_all_list = []\n for i in range(len(scaler_list)):\n # slice predictions for each area and rescale predictions: (Note: slicing columns of numpy array returns a list! 
-> reshaping necessary afterwards)\n yhat_area_i = yhat[:, i]\n # reshape unscaled predictions for scaler:\n yhat_area_i = yhat_area_i.reshape(len(yhat_area_i), 1)\n # apply scaler:\n # Note: for multivariate case n_preds is set to \"1\" since we only take first column of scaler for each area\n yhat_rescaled_area_i = invert_data_scaler(\n scaler_list[i], yhat_area_i, standardizing_flag, scale_range, 1)\n yhat_rescaled_all_list.append(yhat_rescaled_area_i)\n\n # restore numpy_array based on yhat_rescaled_all_list: (this way we have all rescaled predictions for each area in one big numpy array)\n yhat_rescaled_all = yhat_rescaled_all_list[0]\n for i in range(1, len(yhat_rescaled_all_list)):\n yhat_rescaled_all = np.concatenate(\n (yhat_rescaled_all, yhat_rescaled_all_list[i]), axis=1)\n\n print('First 2 scaled predictions')\n print(yhat_rescaled_all[0:2])\n print('Shape of predictions:', yhat_rescaled_all.shape)\n\n # 3) compare predictions with actuals / invert differencing:\n if multivariate_flag == True:\n print('Invert Differencing of multivariate predictions...')\n # invert differencing: (adding value of previous timestep)\n predictions_all = invert_differencing(\n original_complete_dataset, yhat_rescaled_all, year_to_access)\n\n print('predictions preview:')\n print(predictions_all.head())\n\n # 4) get rmse for each timeseries\n rmse_per_ts = []\n for u in range(n_preds):\n rmse_single_ts = np.sqrt(mean_squared_error(\n original_complete_dataset.loc[year_to_access].iloc[:, u], predictions_all.iloc[:, u]))\n rmse_per_ts.append(rmse_single_ts)\n print('RMSE per TS {} for model: {}: {}'.format(\n u, model_name, rmse_per_ts[u]))\n\n # get average of all rmses\n total_rmse = np.mean(rmse_per_ts)\n print('Avg.RMSE for multivariate model: {}: {}'.format(\n model_name, total_rmse))\n\n else:\n # invert differencing: (adding value of previous timestep)\n print('Invert Differencing of predictions...')\n predictions_all = invert_differencing(\n original_complete_dataset, yhat_rescaled_all[:, 0], year_to_access)\n\n print('predictions preview:')\n print(predictions_all.head())\n\n # 4) get rmse:\n rmse = np.sqrt(mean_squared_error(\n original_complete_dataset[year_to_access], predictions_all))\n print('RMSE for model: {}: {}'.format(model_name, rmse))\n\n # return RMSE results:\n rmse_results = []\n if multivariate_flag == True:\n rmse_results.append(total_rmse)\n rmse_results.append(rmse_per_ts)\n\n else:\n rmse_results.append(rmse)\n\n return predictions_all, rmse_results\n\n\n# # functions to store predictions on disk:\n\n\n# safe predictions and history into df: (assuming models are trained with MSE: training and validation set)\ndef preds_into_df(preds_df, original_complete_dataset, multivariate_flag, year_to_store):\n # add actuals to single df:\n if multivariate_flag == False:\n print('Add Actuals to df for single area...')\n # create dataframe to add column easily:\n preds_df = pd.DataFrame(preds_df)\n # store actual values to calculate RMSE quickly\n preds_df['actuals'] = original_complete_dataset[year_to_store]\n\n print('creation of preds_df done')\n\n return preds_df\n\n\ndef trainhistory_into_df(encoder_training_history, prediction_model_training_history, hist_col_labels):\n # create df for traning_history:\n hist_encoder_df = pd.DataFrame(\n encoder_training_history.history['loss'], columns=[hist_col_labels[0]])\n hist_encoder_df[hist_col_labels[1]\n ] = encoder_training_history.history['val_loss']\n\n hist_prediction_df = pd.DataFrame(\n 
prediction_model_training_history.history['loss'], columns=[hist_col_labels[0]])\n hist_prediction_df[hist_col_labels[1]\n ] = prediction_model_training_history.history['val_loss']\n\n print('creation of history_dfs done')\n\n return hist_encoder_df, hist_prediction_df\n\n\n# store predictions and training_history on disk:\ndef store_preds_and_trainhistory_on_disk(preds_df, hist_encoder_df, hist_prediction_df, preds_df_filename, hist_encoder_df_filename, hist_prediction_df_filename, Store_PATH):\n # get path where to store df:\n preds_df_final_path = os.path.join(Store_PATH, preds_df_filename)\n # store df:\n preds_df.to_csv(preds_df_final_path, header=True)\n\n # store history:\n hist_encoder_df_final_path = os.path.join(\n Store_PATH, hist_encoder_df_filename)\n hist_encoder_df.to_csv(hist_encoder_df_final_path, header=True)\n\n hist_prediction_df_final_path = os.path.join(\n Store_PATH, hist_prediction_df_filename)\n hist_prediction_df.to_csv(hist_prediction_df_final_path, header=True)\n\n print('Save df on disk done')\n\n\n# save models to JSON -> check that weights_file name uses \".h5\" format & model_file_name \".json\"\ndef save_models_to_json(model_file_name, model_weights_file_name, Store_PATH, model):\n\n # create paths for model architecture & weights:\n model_final_path = Store_PATH + model_file_name\n weights_final_path = Store_PATH + model_weights_file_name\n\n # store model & weights:\n model_as_json = model.to_json()\n with open(model_final_path, \"w\") as json_file:\n json_file.write(model_as_json)\n # serialize weights to HDF5\n model.save_weights(weights_final_path)\n print(\"Saved model to disk\")\n\n\n# store history + prediction_results in dicts diretly on disk with the help of other functions:\ndef store_results_of_dicts_on_disk(dict_to_access, validation_set_year, test_set_year, Df_Store_PATH, Model_Save_PATH, RMSE_Store_PATH, RMSE_df_name):\n\n # create & store dfs of Results:\n for key in dict_to_access:\n print('Store results of key: ', key)\n\n # store test_results:\n if 'multivariate' not in key:\n multivariate_flag = False\n\n # get prediction_df:\n preds_df = preds_into_df(\n dict_to_access[key][0][2], dict_to_access[key][0][4], multivariate_flag, test_set_year)\n\n # create history dfs:\n hist_col_labels = [\n 'loss (mse)', 'mae', 'val_loss (mse)', 'val_mae']\n hist_encoder_df, hist_prediction_df = trainhistory_into_df(\n dict_to_access[key][0][8], dict_to_access[key][0][0], hist_col_labels)\n\n # create validation_df:\n valid_preds_df = dict_to_access[key][0][5]\n valid_preds_df = pd.DataFrame(valid_preds_df)\n # store actual values of original dataset\n valid_preds_df['actuals'] = dict_to_access[key][0][4].loc[validation_set_year]\n\n else:\n multivariate_flag = True\n # get prediction_df:\n preds_df = preds_into_df(\n dict_to_access[key][0][2], dict_to_access[key][0][4], multivariate_flag, test_set_year)\n\n # create history dfs:\n hist_col_labels = [\n 'loss (mse)', 'mae', 'val_loss (mse)', 'val_mae']\n hist_encoder_df, hist_prediction_df = trainhistory_into_df(\n dict_to_access[key][0][8], dict_to_access[key][0][0], hist_col_labels)\n\n # create validation_df for multivariate case:\n valid_preds_df = dict_to_access[key][0][5]\n\n # create filenames:\n results_file = 'results_' + dict_to_access[key][0][3] + '.csv'\n history_encoder_file = 'history_autoencoder_' + \\\n dict_to_access[key][0][3] + '.csv'\n history_predict_file = 'history_predict_' + \\\n dict_to_access[key][0][3] + '.csv'\n valid_results_file = 'validation_results_' + \\\n 
dict_to_access[key][0][3] + '.csv'\n\n # store test results:\n store_preds_and_trainhistory_on_disk(preds_df, hist_encoder_df, hist_prediction_df,\n results_file, history_encoder_file, history_predict_file, Df_Store_PATH)\n # store validation results:\n # get path where to store df:\n valid_df_final_path = os.path.join(Df_Store_PATH, valid_results_file)\n # store df:\n valid_preds_df.to_csv(valid_df_final_path, header=True)\n\n # store model architecture (architecture + weights):\n autoencoder_model_architecture_file = 'autoencoder_model_' + \\\n dict_to_access[key][0][3] + '.json'\n autoencoder_model_weights_file = 'autoencoder_model_' + \\\n dict_to_access[key][0][3] + '_weights.h5'\n predict_model_architecture_file = 'predict_model_' + \\\n dict_to_access[key][0][3] + '.json'\n predict_model_weights_file = 'predict_model_' + \\\n dict_to_access[key][0][3] + '_weights.h5'\n\n # call function to save models:\n # save autoencoder model:\n save_models_to_json(autoencoder_model_architecture_file,\n autoencoder_model_weights_file, Model_Save_PATH, dict_to_access[key][0][9])\n # save predict model:\n save_models_to_json(predict_model_architecture_file,\n predict_model_weights_file, Model_Save_PATH, dict_to_access[key][0][1])\n\n # store RMSE results of models:\n # create dict to store RMSE results:\n dict_test_rmse = {}\n dict_valid_rmse = {}\n\n # add values to dict:\n # Note: for multivar_models only avg. RMSE is stored!!\n if 'multivar' in key:\n dict_valid_rmse[key] = []\n dict_test_rmse[key] = []\n # append avg. RMSE results of multivar model:\n dict_valid_rmse[key].append(dict_to_access[key][0][7][0])\n dict_test_rmse[key].append(dict_to_access[key][0][6][0])\n\n else:\n dict_test_rmse[key] = []\n dict_valid_rmse[key] = []\n dict_valid_rmse[key].append(dict_to_access[key][0][7])\n dict_test_rmse[key].append(dict_to_access[key][0][6])\n\n # create Df from rmse_dicts:\n rmse_valid_df = pd.DataFrame.from_dict(dict_valid_rmse, orient='index')\n rmse_test_df = pd.DataFrame.from_dict(dict_test_rmse, orient='index')\n", "project_metadata": {"full_name": "vincekellner/demandforecasting", "description": "Forecast taxi demand for given areas in New York City", "topics": [], "git_url": "git://github.com/vincekellner/demandforecasting.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-01-20T15:40:46Z", "size": 8014, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3571286, "Python": 217989}, "last_updated": "2020-10-13T22:27:24Z"}, "intent": " # store results in one df:"}, {"original_comment": "# read df from pickle\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### WordCloud for tweet text retrieved for Mental Anxiety text keyword\n\n#%%\n\nfrom textblob import TextBlob\nimport TweetUtils\nimport numpy as np\nimport pandas as pd\nfrom os import path\nfrom wordcloud import WordCloud, STOPWORDS\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport json\nimport Utils\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ndef generate_word_cloud(data):\n data[\"full_text\"] = data[\"full_text\"].apply(\n lambda tweet: TweetUtils.process_tweet(tweet))\n all_tweets_text = \" \".join(tweet for tweet in data.loc[:, \"full_text\"])\n stopwords = set(STOPWORDS)\n stopwords.update([\"covid\", \"covid-19\", \"corona\", \"pandemic\"])\n # plt.figure(figsize=(800, 400))\n wordcloud = WordCloud(stopwords=stopwords,\n background_color=\"white\").generate(all_tweets_text)\n plt.imshow(wordcloud, interpolation='bilinear')\n 
plt.axis(\"off\")\n plt.show()\n\n#%%", "target_code": "import pandas as pd\n\nPICKLE_FILE = \"df_april_10Kph_All.pkl\"\ndf = pd.read_pickle(PICKLE_FILE)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### WordCloud for tweet text retrieved for Mental Anxiety text keyword\n\n\nfrom textblob import TextBlob\nimport TweetUtils\nimport numpy as np\nfrom os import path\nfrom wordcloud import WordCloud, STOPWORDS\nimport matplotlib.pyplot as plt\nimport json\nimport Utils\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ndef generate_word_cloud(data):\n data[\"full_text\"] = data[\"full_text\"].apply(\n lambda tweet: TweetUtils.process_tweet(tweet))\n all_tweets_text = \" \".join(tweet for tweet in data.loc[:, \"full_text\"])\n stopwords = set(STOPWORDS)\n stopwords.update([\"covid\", \"covid-19\", \"corona\", \"pandemic\"])\n # plt.figure(figsize=(800, 400))\n wordcloud = WordCloud(stopwords=stopwords,\n background_color=\"white\").generate(all_tweets_text)\n plt.imshow(wordcloud, interpolation='bilinear')\n plt.axis(\"off\")\n plt.show()\n\n", "project_metadata": {"full_name": "aiformankind/covid-19-hackathon", "description": null, "topics": [], "git_url": "git://github.com/aiformankind/covid-19-hackathon.git", "stars": 24, "watchers": 24, "forks": 25, "created": "2020-03-01T00:41:41Z", "size": 4288, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1959726, "Python": 4275}, "last_updated": "2020-11-19T21:24:45Z"}, "intent": "# read df from pickle"}, {"original_comment": "# new columns for rolling mean and std of sunspots\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time Series Decomposition & Analysis Assignment\n\n#%%\n\nfrom statsmodels.graphics.tsaplots import plot_acf\nimport pandas as pd\nimport statsmodels.api as sm\nimport plotly.express as px\n\n\n# ### Import the monthly sunspots data set into a Pandas dataframe, and convert the Month field to a datetime data type.\n\n#%%\n\n# read data\nsun = pd.read_csv(\n 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%203/monthly-sunspots.csv')\nprint(sun.shape)\nsun.head()\n\n#%%\n\n# convert date to datetime\nsun['Month'] = pd.to_datetime(sun['Month'])\nsun.info()\n\n\n# ### Use a line chart to plot the time series.\n\n#%%\n\n# line chart def\ndef ilinechart(df, x, y, groups=None, title=''):\n fig = px.line(df, x=x, y=y, color=groups, title=title,\n template='none')\n fig.show()\n\n#%%\n\nilinechart(sun, 'Month', 'Sunspots', title='Month vs. Sunspots Chart')\n\n\n# ### Decompose the time series and add columns for the trend, seasonality, and residuals to the data set.\n\n#%%\n\n# new df with month as index\nseries = sun.set_index('Month')\n\n# decomposition\ndecomposition = sm.tsa.seasonal_decompose(\n series, model='additive', period=132) # sunspot freq=11yrs -- 11*12(months)\n\n# adding columns\ntrend = decomposition.trend.reset_index()\nseasonality = decomposition.seasonal.reset_index()\nresiduals = decomposition.resid.reset_index()\n\n# merging\nmerged = sun.merge(trend, on='Month')\nmerged = merged.merge(seasonality, on='Month')\nmerged = merged.merge(residuals, on='Month')\nmerged.head()\n\n\n# ### Plot the observed values, trend, seasonality, and residuals on a multi-line chart. 
You should have a line for each column.\n#\n# **Hint:** You may need to melt the data so that all the categories are in a single column and all the values are in a single column.\n\n#%%\n\n# melting columns to graph\nmelted = pd.melt(merged,\n id_vars='Month',\n value_vars=['Sunspots', 'trend', 'seasonal', 'resid'],\n var_name='Variables',\n value_name='Values')\nmelted.head()\n\n#%%\n\n# graphing melted df\nilinechart(melted, 'Month', 'Values', groups='Variables',\n title='Monthly Sunspots Observed vs. Components')\n\n\n# ### Add two columns to the data set - one that calculates a rolling mean and another that calculates a rolling standard deviation.\n\n#%%", "target_code": "merged['Rolling_mean'] = merged['Sunspots'].rolling(window=132).mean()\nmerged['Rolling_std'] = merged['Sunspots'].rolling(window=132).std()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time Series Decomposition & Analysis Assignment\n\n\nfrom statsmodels.graphics.tsaplots import plot_acf\nimport pandas as pd\nimport statsmodels.api as sm\nimport plotly.express as px\n\n\n# ### Import the monthly sunspots data set into a Pandas dataframe, and convert the Month field to a datetime data type.\n\n\n# read data\nsun = pd.read_csv(\n 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Time%20Series%20Analysis/Time%20Series%20-%20Day%203/monthly-sunspots.csv')\nprint(sun.shape)\nsun.head()\n\n\n# convert date to datetime\nsun['Month'] = pd.to_datetime(sun['Month'])\nsun.info()\n\n\n# ### Use a line chart to plot the time series.\n\n\n# line chart def\ndef ilinechart(df, x, y, groups=None, title=''):\n fig = px.line(df, x=x, y=y, color=groups, title=title,\n template='none')\n fig.show()\n\n\nilinechart(sun, 'Month', 'Sunspots', title='Month vs. Sunspots Chart')\n\n\n# ### Decompose the time series and add columns for the trend, seasonality, and residuals to the data set.\n\n\n# new df with month as index\nseries = sun.set_index('Month')\n\n# decomposition\ndecomposition = sm.tsa.seasonal_decompose(\n series, model='additive', period=132) # sunspot freq=11yrs -- 11*12(months)\n\n# adding columns\ntrend = decomposition.trend.reset_index()\nseasonality = decomposition.seasonal.reset_index()\nresiduals = decomposition.resid.reset_index()\n\n# merging\nmerged = sun.merge(trend, on='Month')\nmerged = merged.merge(seasonality, on='Month')\nmerged = merged.merge(residuals, on='Month')\nmerged.head()\n\n\n# ### Plot the observed values, trend, seasonality, and residuals on a multi-line chart. You should have a line for each column.\n#\n# **Hint:** You may need to melt the data so that all the categories are in a single column and all the values are in a single column.\n\n\n# melting columns to graph\nmelted = pd.melt(merged,\n id_vars='Month',\n value_vars=['Sunspots', 'trend', 'seasonal', 'resid'],\n var_name='Variables',\n value_name='Values')\nmelted.head()\n\n\n# graphing melted df\nilinechart(melted, 'Month', 'Values', groups='Variables',\n title='Monthly Sunspots Observed vs. 
Components')\n\n\n# ### Add two columns to the data set - one that calculates a rolling mean and another that calculates a rolling standard deviation.\n\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# new columns for rolling mean and std of sunspots"}, {"original_comment": "# Iterate over the first 3 records and decode them.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ##### Copyright © 2019 The TensorFlow Authors.\n\n#%%\n\n# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n# # TFX Estimator Component Tutorial\n#\n# ***A Component-by-Component Introduction to TensorFlow Extended (TFX)***\n\n# ## Background\n# This notebook demonstrates how to use TFX in a Jupyter/Colab environment. Here, we walk through the Chicago Taxi example in an interactive notebook.\n#\n# Working in an interactive notebook is a useful way to become familiar with the structure of a TFX pipeline. It's also useful when doing development of your own pipelines as a lightweight development environment, but you should be aware that there are differences in the way interactive notebooks are orchestrated, and how they access metadata artifacts.\n#\n# ### Orchestration\n#\n# In a production deployment of TFX, you will use an orchestrator such as Apache Airflow, Kubeflow Pipelines, or Apache Beam to orchestrate a pre-defined pipeline graph of TFX components. In an interactive notebook, the notebook itself is the orchestrator, running each TFX component as you execute the notebook cells.\n#\n# ### Metadata\n#\n# In a production deployment of TFX, you will access metadata through the ML Metadata (MLMD) API. MLMD stores metadata properties in a database such as MySQL or SQLite, and stores the metadata payloads in a persistent store such as on your filesystem. 
In an interactive notebook, both properties and payloads are stored in an ephemeral SQLite database in the `/tmp` directory on the Jupyter notebook or Colab server.\n\n# ## Setup\n# First, we install and import the necessary packages, set up paths, and download data.\n\n#%%\n\nfrom tfx.types.standard_artifacts import ModelBlessing\nfrom tfx.types.standard_artifacts import Model\nfrom tfx.types import Channel\nfrom tfx.utils.dsl_utils import external_input\nfrom tfx.proto.evaluator_pb2 import SingleSlicingSpec\nfrom tfx.proto import trainer_pb2\nfrom tfx.proto import pusher_pb2\nfrom tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\nfrom tfx.orchestration import pipeline\nfrom tfx.orchestration import metadata\nfrom tfx.dsl.experimental import latest_blessed_model_resolver\nfrom tfx.components import Transform\nfrom tfx.components import Trainer\nfrom tfx.components import StatisticsGen\nfrom tfx.components import SchemaGen\nfrom tfx.components import ResolverNode\nfrom tfx.components import Pusher\nfrom tfx.components import ExampleValidator\nfrom tfx.components import Evaluator\nfrom tfx.components import CsvExampleGen\nimport tfx\nimport tensorflow_model_analysis as tfma\nimport tensorflow as tf\nimport absl\nimport urllib\nimport tempfile\nimport pprint\nimport os\nget_ipython().system('pip install -q -U --use-feature=2020-resolver tfx')\n\n\n# ## Did you restart the runtime?\n#\n# If you are using Google Colab, the first time that you run the cell above, you must restart the runtime (Runtime > Restart runtime ...). This is because of the way that Colab loads packages.\n\n# ### Import packages\n# We import necessary packages, including standard TFX component classes.\n\n#%%\n\ntf.get_logger().propagate = False\npp = pprint.PrettyPrinter()\n\n\nget_ipython().run_line_magic('load_ext',\n 'tfx.orchestration.experimental.interactive.notebook_extensions.skip')\n\n\n# Let's check the library versions.\n\n#%%\n\nprint('TensorFlow version: {}'.format(tf.__version__))\nprint('TFX version: {}'.format(tfx.__version__))\n\n\n# ### Set up pipeline paths\n\n#%%\n\n# This is the root directory for your TFX pip package installation.\n_tfx_root = tfx.__path__[0]\n\n# This is the directory containing the TFX Chicago Taxi Pipeline example.\n_taxi_root = os.path.join(_tfx_root, 'examples/chicago_taxi_pipeline')\n\n# This is the path where your model will be pushed for serving.\n_serving_model_dir = os.path.join(\n tempfile.mkdtemp(), 'serving_model/taxi_simple')\n\n# Set up logging.\nabsl.logging.set_verbosity(absl.logging.INFO)\n\n\n# ### Download example data\n# We download the example dataset for use in our TFX pipeline.\n#\n# The dataset we're using is the [Taxi Trips dataset](https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew) released by the City of Chicago. The columns in this dataset are:\n#\n# \n# \n# \n# \n# \n# \n# \n#
pickup_community_area | fare | trip_start_month
trip_start_hour | trip_start_day | trip_start_timestamp
pickup_latitude | pickup_longitude | dropoff_latitude
dropoff_longitude | trip_miles | pickup_census_tract
dropoff_census_tract | payment_type | company
trip_seconds | dropoff_community_area | tips
\n#\n# With this dataset, we will build a model that predicts the `tips` of a trip.\n\n#%%\n\n_data_root = tempfile.mkdtemp(prefix='tfx-data')\nDATA_PATH = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv'\n_data_filepath = os.path.join(_data_root, \"data.csv\")\nurllib.request.urlretrieve(DATA_PATH, _data_filepath)\n\n\n# Take a quick look at the CSV file.\n\n#%%\n\nget_ipython().system('head {_data_filepath}')\n\n\n# *Disclaimer: This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of any of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk.*\n\n# ### Create the InteractiveContext\n# Last, we create an InteractiveContext, which will allow us to run TFX components interactively in this notebook.\n\n#%%\n\n# Here, we create an InteractiveContext using default parameters. This will\n# use a temporary directory with an ephemeral ML Metadata database instance.\n# To use your own pipeline root or database, the optional properties\n# `pipeline_root` and `metadata_connection_config` may be passed to\n# InteractiveContext. Calls to InteractiveContext are no-ops outside of the\n# notebook.\ncontext = InteractiveContext()\n\n\n# ## Run TFX components interactively\n# In the cells that follow, we create TFX components one-by-one, run each of them, and visualize their output artifacts.\n\n# ### ExampleGen\n#\n# The `ExampleGen` component is usually at the start of a TFX pipeline. It will:\n#\n# 1. Split data into training and evaluation sets (by default, 2/3 training + 1/3 eval)\n# 2. Convert data into the `tf.Example` format\n# 3. Copy data into the `_tfx_root` directory for other components to access\n#\n# `ExampleGen` takes as input the path to your data source. In our case, this is the `_data_root` path that contains the downloaded CSV.\n#\n# Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the \"Export to Pipeline\" section).\n\n#%%\n\nexample_gen = CsvExampleGen(input=external_input(_data_root))\ncontext.run(example_gen)\n\n\n# Let's examine the output artifacts of `ExampleGen`. 
This component produces two artifacts, training examples and evaluation examples:\n\n#%%\n\nartifact = example_gen.outputs['examples'].get()[0]\nprint(artifact.split_names, artifact.uri)\n\n\n# We can also take a look at the first three training examples:\n\n#%%\n\n# Get the URI of the output artifact representing the training examples, which is a directory\ntrain_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'train')\n\n# Get the list of files in this directory (all compressed TFRecord files)\ntfrecord_filenames = [os.path.join(train_uri, name)\n for name in os.listdir(train_uri)]\n\n# Create a `TFRecordDataset` to read these files\ndataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type=\"GZIP\")", "target_code": "for tfrecord in dataset.take(3):\n serialized_example = tfrecord.numpy()\n example = tf.train.Example()\n example.ParseFromString(serialized_example)\n pp.pprint(example)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ##### Copyright © 2019 The TensorFlow Authors.\n\n\n# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n# # TFX Estimator Component Tutorial\n#\n# ***A Component-by-Component Introduction to TensorFlow Extended (TFX)***\n\n# ## Background\n# This notebook demonstrates how to use TFX in a Jupyter/Colab environment. Here, we walk through the Chicago Taxi example in an interactive notebook.\n#\n# Working in an interactive notebook is a useful way to become familiar with the structure of a TFX pipeline. It's also useful when doing development of your own pipelines as a lightweight development environment, but you should be aware that there are differences in the way interactive notebooks are orchestrated, and how they access metadata artifacts.\n#\n# ### Orchestration\n#\n# In a production deployment of TFX, you will use an orchestrator such as Apache Airflow, Kubeflow Pipelines, or Apache Beam to orchestrate a pre-defined pipeline graph of TFX components. In an interactive notebook, the notebook itself is the orchestrator, running each TFX component as you execute the notebook cells.\n#\n# ### Metadata\n#\n# In a production deployment of TFX, you will access metadata through the ML Metadata (MLMD) API. MLMD stores metadata properties in a database such as MySQL or SQLite, and stores the metadata payloads in a persistent store such as on your filesystem. 
In an interactive notebook, both properties and payloads are stored in an ephemeral SQLite database in the `/tmp` directory on the Jupyter notebook or Colab server.\n\n# ## Setup\n# First, we install and import the necessary packages, set up paths, and download data.\n\n\nfrom tfx.types.standard_artifacts import ModelBlessing\nfrom tfx.types.standard_artifacts import Model\nfrom tfx.types import Channel\nfrom tfx.utils.dsl_utils import external_input\nfrom tfx.proto.evaluator_pb2 import SingleSlicingSpec\nfrom tfx.proto import trainer_pb2\nfrom tfx.proto import pusher_pb2\nfrom tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\nfrom tfx.orchestration import pipeline\nfrom tfx.orchestration import metadata\nfrom tfx.dsl.experimental import latest_blessed_model_resolver\nfrom tfx.components import Transform\nfrom tfx.components import Trainer\nfrom tfx.components import StatisticsGen\nfrom tfx.components import SchemaGen\nfrom tfx.components import ResolverNode\nfrom tfx.components import Pusher\nfrom tfx.components import ExampleValidator\nfrom tfx.components import Evaluator\nfrom tfx.components import CsvExampleGen\nimport tfx\nimport tensorflow_model_analysis as tfma\nimport tensorflow as tf\nimport absl\nimport urllib\nimport tempfile\nimport pprint\nimport os\nget_ipython().system('pip install -q -U --use-feature=2020-resolver tfx')\n\n\n# ## Did you restart the runtime?\n#\n# If you are using Google Colab, the first time that you run the cell above, you must restart the runtime (Runtime > Restart runtime ...). This is because of the way that Colab loads packages.\n\n# ### Import packages\n# We import necessary packages, including standard TFX component classes.\n\n\ntf.get_logger().propagate = False\npp = pprint.PrettyPrinter()\n\n\nget_ipython().run_line_magic('load_ext',\n 'tfx.orchestration.experimental.interactive.notebook_extensions.skip')\n\n\n# Let's check the library versions.\n\n\nprint('TensorFlow version: {}'.format(tf.__version__))\nprint('TFX version: {}'.format(tfx.__version__))\n\n\n# ### Set up pipeline paths\n\n\n# This is the root directory for your TFX pip package installation.\n_tfx_root = tfx.__path__[0]\n\n# This is the directory containing the TFX Chicago Taxi Pipeline example.\n_taxi_root = os.path.join(_tfx_root, 'examples/chicago_taxi_pipeline')\n\n# This is the path where your model will be pushed for serving.\n_serving_model_dir = os.path.join(\n tempfile.mkdtemp(), 'serving_model/taxi_simple')\n\n# Set up logging.\nabsl.logging.set_verbosity(absl.logging.INFO)\n\n\n# ### Download example data\n# We download the example dataset for use in our TFX pipeline.\n#\n# The dataset we're using is the [Taxi Trips dataset](https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew) released by the City of Chicago. The columns in this dataset are:\n#\n# \n# \n# \n# \n# \n# \n# \n#
# pickup_community_area | fare | trip_start_month
# trip_start_hour | trip_start_day | trip_start_timestamp
# pickup_latitude | pickup_longitude | dropoff_latitude
# dropoff_longitude | trip_miles | pickup_census_tract
# dropoff_census_tract | payment_type | company
# trip_seconds | dropoff_community_area | tips
\n#\n# With this dataset, we will build a model that predicts the `tips` of a trip.\n\n\n_data_root = tempfile.mkdtemp(prefix='tfx-data')\nDATA_PATH = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv'\n_data_filepath = os.path.join(_data_root, \"data.csv\")\nurllib.request.urlretrieve(DATA_PATH, _data_filepath)\n\n\n# Take a quick look at the CSV file.\n\n\nget_ipython().system('head {_data_filepath}')\n\n\n# *Disclaimer: This site provides applications using data that has been modified for use from its original source, www.cityofchicago.org, the official website of the City of Chicago. The City of Chicago makes no claims as to the content, accuracy, timeliness, or completeness of any of the data provided at this site. The data provided at this site is subject to change at any time. It is understood that the data provided at this site is being used at one\u2019s own risk.*\n\n# ### Create the InteractiveContext\n# Last, we create an InteractiveContext, which will allow us to run TFX components interactively in this notebook.\n\n\n# Here, we create an InteractiveContext using default parameters. This will\n# use a temporary directory with an ephemeral ML Metadata database instance.\n# To use your own pipeline root or database, the optional properties\n# `pipeline_root` and `metadata_connection_config` may be passed to\n# InteractiveContext. Calls to InteractiveContext are no-ops outside of the\n# notebook.\ncontext = InteractiveContext()\n\n\n# ## Run TFX components interactively\n# In the cells that follow, we create TFX components one-by-one, run each of them, and visualize their output artifacts.\n\n# ### ExampleGen\n#\n# The `ExampleGen` component is usually at the start of a TFX pipeline. It will:\n#\n# 1. Split data into training and evaluation sets (by default, 2/3 training + 1/3 eval)\n# 2. Convert data into the `tf.Example` format\n# 3. Copy data into the `_tfx_root` directory for other components to access\n#\n# `ExampleGen` takes as input the path to your data source. In our case, this is the `_data_root` path that contains the downloaded CSV.\n#\n# Note: In this notebook, we can instantiate components one-by-one and run them with `InteractiveContext.run()`. By contrast, in a production setting, we would specify all the components upfront in a `Pipeline` to pass to the orchestrator (see the \"Export to Pipeline\" section).\n\n\nexample_gen = CsvExampleGen(input=external_input(_data_root))\ncontext.run(example_gen)\n\n\n# Let's examine the output artifacts of `ExampleGen`. 
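# Before doing so, here is a minimal, hedged sketch of what the production-style `Pipeline` definition mentioned in the note above might look like for this single component; the pipeline name and root path are assumptions, not values from this tutorial, and the metadata config refers back to the SQLite sketch shown earlier:
#
# ```python
# from tfx.orchestration import pipeline
#
# production_pipeline = pipeline.Pipeline(
#     pipeline_name='taxi_example_gen_only',    # hypothetical name
#     pipeline_root='/tmp/tfx_pipeline_root',   # hypothetical path
#     components=[example_gen],
#     metadata_connection_config=metadata_connection_config,
# )
# ```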
This component produces two artifacts, training examples and evaluation examples:\n\n\nartifact = example_gen.outputs['examples'].get()[0]\nprint(artifact.split_names, artifact.uri)\n\n\n# We can also take a look at the first three training examples:\n\n\n# Get the URI of the output artifact representing the training examples, which is a directory\ntrain_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'train')\n\n# Get the list of files in this directory (all compressed TFRecord files)\ntfrecord_filenames = [os.path.join(train_uri, name)\n for name in os.listdir(train_uri)]\n\n# Create a `TFRecordDataset` to read these files\ndataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type=\"GZIP\")\n", "project_metadata": {"full_name": "luotigerlsx/kubeflow-pipeline-fantasy", "description": "Tutorials, Examples about Kubeflow Pipeline.", "topics": ["kubeflow-pipelines", "pipeline-component", "kubeflow-cluster"], "git_url": "git://github.com/luotigerlsx/kubeflow-pipeline-fantasy.git", "stars": 6, "watchers": 6, "forks": 1, "created": "2019-11-04T10:24:00Z", "size": 1666, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 656994, "Python": 111694, "Shell": 6200, "Dockerfile": 861}, "last_updated": "2020-12-29T19:34:00Z"}, "intent": "# Iterate over the first 3 records and decode them."}, {"original_comment": "# Specify the x-axis label\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. 
Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')", "target_code": "plt.xlabel('Hours since midnight August 1, 2010')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. 
You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "intent": "# Specify the x-axis label"}, {"original_comment": " # Get image size\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport time\nimport sys\nimport numba\nimport tensorflow as tf\nfrom numba import jit\nfrom torch.autograd import Variable\nimport numpy as np\nimport torch\nimport math\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n# Interpolation kernel\n\n\n@jit\ndef u(l1, l2, l3, l4, a):\n x1 = abs(l1)\n x2 = abs(l2)\n x3 = abs(l3)\n x4 = abs(l4)\n\n y1 = 0\n if (x1 >= 0) & (x1 <= 1):\n\n y1 = (a+2)*(x1**3)-(a+3)*(x1**2)+1\n elif (x1 > 1) & (x1 <= 2):\n y1 = a*(x1**3)-(5*a)*(x1**2)+(8*a)*x1-4*a\n\n y2 = 0\n if (x2 >= 0) & (x2 <= 1):\n\n y2 = (a+2)*(x2**3)-(a+3)*(x2**2)+1\n elif (x2 > 1) & (x2 <= 2):\n y2 = a*(x2**3)-(5*a)*(x2**2)+(8*a)*x2-4*a\n\n y3 = 0\n if (x3 >= 0) & (x3 <= 1):\n\n y3 = (a+2)*(x3**3)-(a+3)*(x3**2)+1\n elif (x3 > 1) & (x3 <= 2):\n y3 = a*(x3**3)-(5*a)*(x3**2)+(8*a)*x3-4*a\n\n y4 = 0\n if (x4 >= 0) & (x4 <= 1):\n\n y4 = (a+2)*(x4**3)-(a+3)*(x4**2)+1\n elif (x4 > 1) & (x4 <= 2):\n y4 = a*(x4**3)-(5*a)*(x4**2)+(8*a)*x4-4*a\n\n return y1, y2, y3, y4\n\n\n@jit\ndef d(mat):\n mat_l, mat_m, mat_r = mat\n d = np.dot(np.dot(mat_l, mat_m), mat_r)\n return d\n\n\n@jit\ndef mat(h, a, i, j, c, img):\n x, y = i * h + 2, j * h + 2\n\n fx = math.floor(x)\n x1 = 1 + x - fx\n x2 = x - fx\n x3 = fx + 1 - x\n x4 = fx + 2 - x\n\n fy = math.floor(y)\n y1 = 1 + y - fy\n y2 = y - fy\n y3 = fy + 1 - y\n y4 = fy + 2 - y\n\n ny1 = int(y-y1)\n ny2 = int(y-y2)\n ny3 = int(y+y3)\n ny4 = int(y+y4)\n nmx1 = int(x-x1)\n nmx2 = int(x-x2)\n npx3 = int(x+x3)\n npx4 = 
int(x+x4)\n\n mat_m = np.array([[img[0, c, ny1, nmx1], img[0, c, ny2, nmx1], img[0, c, ny3, nmx1], img[0, c, ny4, nmx1]],\n [img[0, c, ny1, nmx2], img[0, c, ny2, nmx2],\n img[0, c, ny3, nmx2], img[0, c, ny4, nmx2]],\n [img[0, c, ny1, npx3], img[0, c, ny2, npx3],\n img[0, c, ny3, npx3], img[0, c, ny4, npx3]],\n [img[0, c, ny1, npx4], img[0, c, ny2, npx4], img[0, c, ny3, npx4], img[0, c, ny4, npx4]]])\n\n p1, p2, p3, p4 = u(y1, y2, y3, y4, a)\n\n return np.array([[u(x1, x2, x3, x4, a)]]), mat_m, np.array([[p1], [p2], [p3], [p4]])\n\n\n# Paddnig\ndef padding(img, B, C, H, W):\n zimg = np.zeros([B, C, H+4, W+4])\n zimg[:, :C, 2:H+2, 2:W+2] = img\n # Pad the first/last two col and row\n zimg[:, :C, 2:H+2, 0:2] = img[:, :C, :, 0:1]\n zimg[:, :, H+2:H+4, 2:W+2] = img[:, :, H-1:H, :]\n zimg[:, :, 2:H+2, W+2:W+4] = img[:, :, :, W-1:W]\n zimg[:, :C, 0:2, 2:W+2] = img[:, :C, 0:1, :]\n # Pad the missing eight points\n zimg[0, :C, 0:2, 0:2] = img[0, :C, 0, 0]\n zimg[0, :C, H+2:H+4, 0:2] = img[0, :C, H-1, 0]\n zimg[0, :C, H+2:H+4, W+2:W+4] = img[0, :C, H-1, W-1]\n zimg[0, :C, 0:2, W+2:W+4] = img[0, :C, 0, W-1]\n return zimg\n\n# Bicubic operation\n\n\ndef bicubic(img):\n\n # Coefficient\n img = img.cpu().detach().numpy()\n a = -0.75\n\n # Get image size\n B, C, H, W = img.shape\n\n img = padding(img, B, C, H, W)\n\n # Create new image\n dH = 6\n dW = 6\n dst = np.zeros([B, C, dH, dW])\n\n h = 1/(6/H)\n\n for b in range(B):\n for c in range(C):\n for j in range(dH):\n for i in range(dW):\n\n dst[b, c, j, i] = d(mat(h, a, i, j, c, img))\n\n dst = torch.Tensor(dst) # .cuda()\n #dst = dst.type(torch.cuda.FloatTensor)\n #dst = Variable(dst, requires_grad=True).cuda()\n return dst\n\n\ntorchImg = torch.FloatTensor([[[[1, 1, 1], [2, 2, 2], [3, 3, 3]]], [\n [[1, 1, 1], [2, 2, 2], [3, 3, 3]]]])\n#torchImg = torchImg.permute(0, 2,3, 1)\nprint('torchImg', torchImg.shape)\n\ndst = bicubic(torchImg)\nprint(torch.Tensor(dst).shape)\nprint(dst)\n\n#%%\n\n\n\n#%%\n\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n\ndef bicubic(tensor, upsc_size=6, interp='bicubic', align_corners=False, name=None):\n\n tensor = tensor.permute(0, 2, 3, 1)\n\n tfTensor = tf.convert_to_tensor(tensor)\n\n bicubic = tf.image.resize_bicubic(\n tfTensor,\n (upsc_size, upsc_size),\n align_corners=False,\n name=None\n )\n a = tf.InteractiveSession()\n torchTensor = torch.from_numpy(bicubic.eval())\n a.close()\n torchTensor = torchTensor.permute(0, 3, 1, 2)\n\n #torchTensor = torchTensor.type(torch.cuda.FloatTensor)\n #torchTensor = Variable(torchTensor, requires_grad=True).cuda()\n\n return torchTensor\n\n\ntorchImg = torch.FloatTensor([[[[1, 1, 1], [2, 2, 2], [3, 3, 3]]], [\n [[1, 1, 1], [2, 2, 2], [3, 3, 3]]]])\n\nd = bicubic(torchImg)\nprint(d)\n\n#%%\n\n\n\n#%%\n\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n\n# Interpolation kernel\ndef u(s, a):\n if (abs(s) >= 0) & (abs(s) <= 1):\n return (a+2)*(abs(s)**3)-(a+3)*(abs(s)**2)+1\n elif (abs(s) > 1) & (abs(s) <= 2):\n return a*(abs(s)**3)-(5*a)*(abs(s)**2)+(8*a)*abs(s)-4*a\n return 0\n\n# Paddnig\n\n\ndef padding(img, B, C, H, W):\n zimg = np.zeros((B, C, H+4, W+4))\n zimg[:, :C, 2:H+2, 2:W+2] = img\n # Pad the first/last two col and row\n zimg[:, :C, 2:H+2, 0:2] = img[:, :C, :, 0:1]\n zimg[:, :, H+2:H+4, 2:W+2] = img[:, :, H-1:H, :]\n zimg[:, :, 2:H+2, W+2:W+4] = img[:, :, :, W-1:W]\n zimg[:, :C, 0:2, 2:W+2] = img[:, :C, 0:1, :]\n # Pad the missing eight points\n zimg[:, :C, 0:2, 0:2] = img[:, :C, 0, 0]\n zimg[:, :C, H+2:H+4, 0:2] = img[:, :C, H-1, 0]\n zimg[:, :C, 
H+2:H+4, W+2:W+4] = img[:, :C, H-1, W-1]\n zimg[:, :C, 0:2, W+2:W+4] = img[:, :C, 0, W-1]\n return zimg\n\n# https://github.com/yunabe/codelab/blob/master/misc/terminal_progressbar/progress.py\n\n\ndef get_progressbar_str(progress):\n END = 170\n MAX_LEN = 30\n BAR_LEN = int(MAX_LEN * progress)\n return ('Progress:[' + '=' * BAR_LEN +\n ('>' if BAR_LEN < MAX_LEN else '') +\n ' ' * (MAX_LEN - BAR_LEN) +\n '] %.1f%%' % (progress * 100.))\n\n# Bicubic operation\n\n\ndef bicubic(img, ratio, a):\n\n img = np.array(img)", "target_code": " B, C, H, W = img.shape\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport time\nimport sys\nimport numba\nimport tensorflow as tf\nfrom numba import jit\nfrom torch.autograd import Variable\nimport numpy as np\nimport torch\nimport math\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n# Interpolation kernel\n\n\n@jit\ndef u(l1, l2, l3, l4, a):\n x1 = abs(l1)\n x2 = abs(l2)\n x3 = abs(l3)\n x4 = abs(l4)\n\n y1 = 0\n if (x1 >= 0) & (x1 <= 1):\n\n y1 = (a+2)*(x1**3)-(a+3)*(x1**2)+1\n elif (x1 > 1) & (x1 <= 2):\n y1 = a*(x1**3)-(5*a)*(x1**2)+(8*a)*x1-4*a\n\n y2 = 0\n if (x2 >= 0) & (x2 <= 1):\n\n y2 = (a+2)*(x2**3)-(a+3)*(x2**2)+1\n elif (x2 > 1) & (x2 <= 2):\n y2 = a*(x2**3)-(5*a)*(x2**2)+(8*a)*x2-4*a\n\n y3 = 0\n if (x3 >= 0) & (x3 <= 1):\n\n y3 = (a+2)*(x3**3)-(a+3)*(x3**2)+1\n elif (x3 > 1) & (x3 <= 2):\n y3 = a*(x3**3)-(5*a)*(x3**2)+(8*a)*x3-4*a\n\n y4 = 0\n if (x4 >= 0) & (x4 <= 1):\n\n y4 = (a+2)*(x4**3)-(a+3)*(x4**2)+1\n elif (x4 > 1) & (x4 <= 2):\n y4 = a*(x4**3)-(5*a)*(x4**2)+(8*a)*x4-4*a\n\n return y1, y2, y3, y4\n\n\n@jit\ndef d(mat):\n mat_l, mat_m, mat_r = mat\n d = np.dot(np.dot(mat_l, mat_m), mat_r)\n return d\n\n\n@jit\ndef mat(h, a, i, j, c, img):\n x, y = i * h + 2, j * h + 2\n\n fx = math.floor(x)\n x1 = 1 + x - fx\n x2 = x - fx\n x3 = fx + 1 - x\n x4 = fx + 2 - x\n\n fy = math.floor(y)\n y1 = 1 + y - fy\n y2 = y - fy\n y3 = fy + 1 - y\n y4 = fy + 2 - y\n\n ny1 = int(y-y1)\n ny2 = int(y-y2)\n ny3 = int(y+y3)\n ny4 = int(y+y4)\n nmx1 = int(x-x1)\n nmx2 = int(x-x2)\n npx3 = int(x+x3)\n npx4 = int(x+x4)\n\n mat_m = np.array([[img[0, c, ny1, nmx1], img[0, c, ny2, nmx1], img[0, c, ny3, nmx1], img[0, c, ny4, nmx1]],\n [img[0, c, ny1, nmx2], img[0, c, ny2, nmx2],\n img[0, c, ny3, nmx2], img[0, c, ny4, nmx2]],\n [img[0, c, ny1, npx3], img[0, c, ny2, npx3],\n img[0, c, ny3, npx3], img[0, c, ny4, npx3]],\n [img[0, c, ny1, npx4], img[0, c, ny2, npx4], img[0, c, ny3, npx4], img[0, c, ny4, npx4]]])\n\n p1, p2, p3, p4 = u(y1, y2, y3, y4, a)\n\n return np.array([[u(x1, x2, x3, x4, a)]]), mat_m, np.array([[p1], [p2], [p3], [p4]])\n\n\n# Paddnig\ndef padding(img, B, C, H, W):\n zimg = np.zeros([B, C, H+4, W+4])\n zimg[:, :C, 2:H+2, 2:W+2] = img\n # Pad the first/last two col and row\n zimg[:, :C, 2:H+2, 0:2] = img[:, :C, :, 0:1]\n zimg[:, :, H+2:H+4, 2:W+2] = img[:, :, H-1:H, :]\n zimg[:, :, 2:H+2, W+2:W+4] = img[:, :, :, W-1:W]\n zimg[:, :C, 0:2, 2:W+2] = img[:, :C, 0:1, :]\n # Pad the missing eight points\n zimg[0, :C, 0:2, 0:2] = img[0, :C, 0, 0]\n zimg[0, :C, H+2:H+4, 0:2] = img[0, :C, H-1, 0]\n zimg[0, :C, H+2:H+4, W+2:W+4] = img[0, :C, H-1, W-1]\n zimg[0, :C, 0:2, W+2:W+4] = img[0, :C, 0, W-1]\n return zimg\n\n# Bicubic operation\n\n\ndef bicubic(img):\n\n # Coefficient\n img = img.cpu().detach().numpy()\n a = -0.75\n\n # Get image size\n B, C, H, W = img.shape\n\n img = padding(img, B, C, H, W)\n\n # Create new image\n dH = 6\n dW = 6\n dst = np.zeros([B, C, dH, dW])\n\n h = 1/(6/H)\n\n for b in range(B):\n for c in range(C):\n 
for j in range(dH):\n for i in range(dW):\n\n dst[b, c, j, i] = d(mat(h, a, i, j, c, img))\n\n dst = torch.Tensor(dst) # .cuda()\n #dst = dst.type(torch.cuda.FloatTensor)\n #dst = Variable(dst, requires_grad=True).cuda()\n return dst\n\n\ntorchImg = torch.FloatTensor([[[[1, 1, 1], [2, 2, 2], [3, 3, 3]]], [\n [[1, 1, 1], [2, 2, 2], [3, 3, 3]]]])\n#torchImg = torchImg.permute(0, 2,3, 1)\nprint('torchImg', torchImg.shape)\n\ndst = bicubic(torchImg)\nprint(torch.Tensor(dst).shape)\nprint(dst)\n\n\n\n\n\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n\ndef bicubic(tensor, upsc_size=6, interp='bicubic', align_corners=False, name=None):\n\n tensor = tensor.permute(0, 2, 3, 1)\n\n tfTensor = tf.convert_to_tensor(tensor)\n\n bicubic = tf.image.resize_bicubic(\n tfTensor,\n (upsc_size, upsc_size),\n align_corners=False,\n name=None\n )\n a = tf.InteractiveSession()\n torchTensor = torch.from_numpy(bicubic.eval())\n a.close()\n torchTensor = torchTensor.permute(0, 3, 1, 2)\n\n #torchTensor = torchTensor.type(torch.cuda.FloatTensor)\n #torchTensor = Variable(torchTensor, requires_grad=True).cuda()\n\n return torchTensor\n\n\ntorchImg = torch.FloatTensor([[[[1, 1, 1], [2, 2, 2], [3, 3, 3]]], [\n [[1, 1, 1], [2, 2, 2], [3, 3, 3]]]])\n\nd = bicubic(torchImg)\nprint(d)\n\n\n\n\n\nget_ipython().run_line_magic('timeit', 'range(1000)')\n\n\n# Interpolation kernel\ndef u(s, a):\n if (abs(s) >= 0) & (abs(s) <= 1):\n return (a+2)*(abs(s)**3)-(a+3)*(abs(s)**2)+1\n elif (abs(s) > 1) & (abs(s) <= 2):\n return a*(abs(s)**3)-(5*a)*(abs(s)**2)+(8*a)*abs(s)-4*a\n return 0\n\n# Paddnig\n\n\ndef padding(img, B, C, H, W):\n zimg = np.zeros((B, C, H+4, W+4))\n zimg[:, :C, 2:H+2, 2:W+2] = img\n # Pad the first/last two col and row\n zimg[:, :C, 2:H+2, 0:2] = img[:, :C, :, 0:1]\n zimg[:, :, H+2:H+4, 2:W+2] = img[:, :, H-1:H, :]\n zimg[:, :, 2:H+2, W+2:W+4] = img[:, :, :, W-1:W]\n zimg[:, :C, 0:2, 2:W+2] = img[:, :C, 0:1, :]\n # Pad the missing eight points\n zimg[:, :C, 0:2, 0:2] = img[:, :C, 0, 0]\n zimg[:, :C, H+2:H+4, 0:2] = img[:, :C, H-1, 0]\n zimg[:, :C, H+2:H+4, W+2:W+4] = img[:, :C, H-1, W-1]\n zimg[:, :C, 0:2, W+2:W+4] = img[:, :C, 0, W-1]\n return zimg\n\n# https://github.com/yunabe/codelab/blob/master/misc/terminal_progressbar/progress.py\n\n\ndef get_progressbar_str(progress):\n END = 170\n MAX_LEN = 30\n BAR_LEN = int(MAX_LEN * progress)\n return ('Progress:[' + '=' * BAR_LEN +\n ('>' if BAR_LEN < MAX_LEN else '') +\n ' ' * (MAX_LEN - BAR_LEN) +\n '] %.1f%%' % (progress * 100.))\n\n# Bicubic operation\n\n\ndef bicubic(img, ratio, a):\n\n img = np.array(img)\n", "project_metadata": {"full_name": "arbitularov/PyTorch-Bicubic-interpolation", "description": "Bicubic interpolation for PyTorch", "topics": [], "git_url": "git://github.com/arbitularov/PyTorch-Bicubic-interpolation.git", "stars": 15, "watchers": 15, "forks": 0, "created": "2019-02-05T12:33:38Z", "size": 526, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16031, "Python": 10402}, "last_updated": "2020-07-23T22:52:02Z"}, "intent": " # Get image size"}, {"original_comment": "# Missing Values will be filled by \"U\" = Unknown\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n# ## MSDS422 Assignment 04\n#\n#
\n# More Technical: Throughout the notebook, these types of boxes provide more technical details and extra references about what you are seeing. They contain helpful tips, but you can safely skip them the first time you run through the code.\n#
\n\n# ### Data Dictionary Titanic Dataset\n#\n\n#
\n#\n# | Variable | Description | Details |\n# | :--- | :----: | :----: |\n# | Survival | Survival | 0 = No; 1 = Yes |\n# | pclass | Passenger Class | 1 = 1st; 2 = 2nd; 3 = 3rd |\n# | name | First and Last Name | |\n# | sex | Sex | |\n# | age | Age | |\n# | sibsp \t | Number of Siblings/Spouses Aboard | |\n# | parch | Number of Parents/Children Aboard | |\n# | ticket \t | Ticket Number \t | |\n# | fare \t | Passenger Fare \t | |\n# | cabin \t | Cabin \t | |\n# | embarked \t | Port of Embarkation \t |C = Cherbourg; Q = Queenstown; S = Southampton |\n#\n\n# ## Import packages\n#\n#\n\n#%%\n\nimport warnings\nimport numpy as np\nimport pandas as pd\nfrom math import sqrt\n\nimport statsmodels.formula.api as sm\nfrom xgboost import XGBClassifier\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.ensemble import RandomForestClassifier # Random Forest package\nfrom sklearn.ensemble import ExtraTreesClassifier # Extra Trees package\n\nfrom sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score, roc_auc_score, roc_curve\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import train_test_split\n\nimport scikitplot as skplt\nimport seaborn as sns\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\n\nsns.set_style(\"whitegrid\")\nsns.set(style=\"whitegrid\", color_codes=True)\nplt.rc(\"font\", size=14)\nnp.random.seed(42)\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nnp.set_printoptions(precision=4)\n\n\n#
\n# Suppress warning messages
\n\n#%%\n\ndef warn(*args, **kwargs):\n pass\n\n\nwarnings.warn = warn\n\n\n# ### Mount Google Drive to Colab Enviorment\n\n#%%\n\n#from google.colab import drive\n# drive.mount('/content/gdrive')\n\n\n# ### Load Data (Local Directory)\n\n#%%\n\ntraining_df = pd.read_csv(\"./data/MSDS422_04_train.csv\")\ntest_df = pd.read_csv(\"./data/MSDS422_04_test.csv\")\n\n\n# ### Data Quality Review\n\n#%%\n\nprint(\"Shape:\", training_df.shape, \"\\n\")\nprint(\"Variable Types:\")\nprint(training_df.dtypes)\n\n#%%\n\nprint(\"Shape:\", test_df.shape, \"\\n\")\nprint(\"Variable Types:\")\nprint(test_df.dtypes)\n\n#%%\n\ntraining_df.head()\n\n\n# ## Exploritory Data Analysis (EDA)\n\n# ### Review Training and Test Dataset for Missing Values\n\n#%%\n\ntraining_df.isnull().sum()\n\n#%%\n\ntest_df.isnull().sum()\n\n\n#
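# The missing counts above can also be expressed as a percentage of rows per column. This is a minimal, optional sketch; it assumes ```training_df``` and ```test_df``` are the DataFrames loaded above.

#%%

# Percentage of missing values per column; .isnull().mean() gives the per-column fraction of nulls
missing_pct_train = training_df.isnull().mean().mul(100).round(2)
missing_pct_test = test_df.isnull().mean().mul(100).round(2)
print(missing_pct_train)
print(missing_pct_test)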
\n# Pay attention to the count row, which reports the number of non-null records in each column\n#
\n\n#%%\n\ntraining_df.describe()\n\n#%%\n\nsns.barplot(x=\"Sex\", y=\"Survived\", data=training_df)\nplt.title(\"Distribution of Survival based on Gender\")\nplt.show()\n\ntotal_survived_females = training_df[training_df.Sex ==\n \"female\"][\"Survived\"].sum()\ntotal_survived_males = training_df[training_df.Sex == \"male\"][\"Survived\"].sum()\n\nprint(\"Total people survived is: \" +\n str((total_survived_females + total_survived_males)))\nprint(\"Proportion of Females who survived:\")\nprint(total_survived_females/(total_survived_females + total_survived_males))\nprint(\"Proportion of Males who survived:\")\nprint(total_survived_males/(total_survived_females + total_survived_males))\n\n#%%\n\nsns.barplot(x=\"Pclass\", y=\"Survived\", data=training_df)\nplt.ylabel(\"Survival Rate\")\nplt.title(\"Distribution of Survival Based on Class\")\nplt.show()\n\ntotal_survived_one = training_df[training_df.Pclass == 1][\"Survived\"].sum()\ntotal_survived_two = training_df[training_df.Pclass == 2][\"Survived\"].sum()\ntotal_survived_three = training_df[training_df.Pclass == 3][\"Survived\"].sum()\ntotal_survived_class = total_survived_one + \\\n total_survived_two + total_survived_three\n\nprint(\"Total people survived is: \" + str(total_survived_class))\nprint(\"Proportion of Class 1 Passengers who survived:\")\nprint(total_survived_one/total_survived_class)\nprint(\"Proportion of Class 2 Passengers who survived:\")\nprint(total_survived_two/total_survived_class)\nprint(\"Proportion of Class 3 Passengers who survived:\")\nprint(total_survived_three/total_survived_class)\n\n#%%\n\nsns.barplot(x=\"Sex\", y=\"Survived\", hue=\"Pclass\", data=training_df)\nplt.ylabel(\"Survival Rate\")\nplt.title(\"Survival Rates Based on Gender and Class\")\n\n\n# ### Correlation Heatmap\n\n#
\n# Seaborn Heatmap Documentation
\n# https://seaborn.pydata.org/generated/seaborn.heatmap.html\n#
\n\n#%%\n\nf, ax = plt.subplots(figsize=(10, 10))\nsns.heatmap(training_df.corr(), annot=True, linewidths=0.5, fmt='.2f', ax=ax)\n\n\n# ## Preprocess Data for Analysis\n\n# ### Fix Missing Values, Feature Engineering\n\n# #### Encode Gender Male = 1, Female = 0\n\n#%%\n\nsex_map = {\"male\": 1, \"female\": 0}\ntraining_df[\"Sex\"] = training_df[\"Sex\"].map(sex_map)\ntest_df[\"Sex\"] = test_df[\"Sex\"].map(sex_map)\n\n#%%\n\ntraining_df[\"Sex\"].head()\n\n\n# Passenger Class (Pclass) and Number of Siblings (SibSp) provide insights into the social-economic status differences. The Age of the passenger will be imputed by median age of each sub-group\n\n# ### Training Dataset\n\n#%%\n\ntraining_df1 = training_df.groupby([\"Pclass\", \"SibSp\"])\ntraining_df1_median = training_df1.median()\ntraining_df1_median\n\n\n# ### Test Dataset\n\n#%%\n\ntest_df1 = test_df.groupby([\"Pclass\", \"SibSp\"])\ntest_df1_median = test_df1.median()\ntest_df1_median\n\n\n# ### Function - Impute Age\n\n#%%\n\ndef impute_age(dataset, dataset_med):\n for x in range(len(dataset)):\n if dataset[\"Pclass\"][x] == 1:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[1, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[1, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[1, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[1, 3][\"Age\"]\n elif dataset[\"Pclass\"][x] == 2:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[2, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[2, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[2, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[2, 3][\"Age\"]\n elif dataset[\"Pclass\"][x] == 3:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[3, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[3, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[3, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[3, 3][\"Age\"]\n elif dataset[\"SibSp\"][x] == 4:\n return dataset_med.loc[3, 4][\"Age\"]\n elif dataset[\"SibSp\"][x] == 5:\n return dataset_med.loc[3, 5][\"Age\"]\n elif dataset[\"SibSp\"][x] == 8:\n return dataset_med.loc[3][\"Age\"].median()\n\n#%%\n\ntraining_df[\"Age\"] = training_df[\"Age\"].fillna(\n impute_age(training_df, training_df1_median))\ntest_df[\"Age\"] = test_df[\"Age\"].fillna(impute_age(test_df, test_df1_median))\n\n#%%\n\nprint(training_df.isnull().sum()), print(test_df.isnull().sum())\n\n\n# ### Fix Missing Values Cabin", "target_code": "training_df[\"Cabin\"] = training_df[\"Cabin\"].fillna(\"U\")\ntest_df[\"Cabin\"] = test_df[\"Cabin\"].fillna(\"U\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n# ## MSDS422 Assignment 04\n#\n#
\n# More Technical: Throughout the notebook, these types of boxes provide more technical details and extra references about what you are seeing. They contain helpful tips, but you can safely skip them the first time you run through the code.\n#
\n\n# ### Data Dictionary Titanic Dataset\n#\n\n#
\n#\n# | Variable | Description | Details |\n# | :--- | :----: | :----: |\n# | Survival | Survival | 0 = No; 1 = Yes |\n# | pclass | Passenger Class | 1 = 1st; 2 = 2nd; 3 = 3rd |\n# | name | First and Last Name | |\n# | sex | Sex | |\n# | age | Age | |\n# | sibsp \t | Number of Siblings/Spouses Aboard | |\n# | parch | Number of Parents/Children Aboard | |\n# | ticket \t | Ticket Number \t | |\n# | fare \t | Passenger Fare \t | |\n# | cabin \t | Cabin \t | |\n# | embarked \t | Port of Embarkation \t |C = Cherbourg; Q = Queenstown; S = Southampton |\n#\n\n# ## Import packages\n#\n#\n\n\nimport warnings\nimport numpy as np\nimport pandas as pd\nfrom math import sqrt\n\nimport statsmodels.formula.api as sm\nfrom xgboost import XGBClassifier\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.ensemble import RandomForestClassifier # Random Forest package\nfrom sklearn.ensemble import ExtraTreesClassifier # Extra Trees package\n\nfrom sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score, roc_auc_score, roc_curve\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import train_test_split\n\nimport scikitplot as skplt\nimport seaborn as sns\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\n\nsns.set_style(\"whitegrid\")\nsns.set(style=\"whitegrid\", color_codes=True)\nplt.rc(\"font\", size=14)\nnp.random.seed(42)\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nnp.set_printoptions(precision=4)\n\n\n#
\n# Suppress warning messages
\n\n\ndef warn(*args, **kwargs):\n pass\n\n\nwarnings.warn = warn\n\n\n# ### Mount Google Drive to Colab Enviorment\n\n\n#from google.colab import drive\n# drive.mount('/content/gdrive')\n\n\n# ### Load Data (Local Directory)\n\n\ntraining_df = pd.read_csv(\"./data/MSDS422_04_train.csv\")\ntest_df = pd.read_csv(\"./data/MSDS422_04_test.csv\")\n\n\n# ### Data Quality Review\n\n\nprint(\"Shape:\", training_df.shape, \"\\n\")\nprint(\"Variable Types:\")\nprint(training_df.dtypes)\n\n\nprint(\"Shape:\", test_df.shape, \"\\n\")\nprint(\"Variable Types:\")\nprint(test_df.dtypes)\n\n\ntraining_df.head()\n\n\n# ## Exploritory Data Analysis (EDA)\n\n# ### Review Training and Test Dataset for Missing Values\n\n\ntraining_df.isnull().sum()\n\n\ntest_df.isnull().sum()\n\n\n#
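# A quick visual companion to the counts above: plot the per-column missing values in the training set as a bar chart. This is a minimal sketch, assuming ```training_df``` and the matplotlib import above.

# Bar chart of missing values per column (only columns with at least one missing value)
missing_counts = training_df.isnull().sum()
missing_counts[missing_counts > 0].plot(kind='bar')
plt.ylabel('Missing values')
plt.title('Missing Values per Column (Training Set)')
plt.show()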
\n# Pay attention to the count row, which reports the number of non-null records in each column\n#
\n\n\ntraining_df.describe()\n\n\nsns.barplot(x=\"Sex\", y=\"Survived\", data=training_df)\nplt.title(\"Distribution of Survival based on Gender\")\nplt.show()\n\ntotal_survived_females = training_df[training_df.Sex ==\n \"female\"][\"Survived\"].sum()\ntotal_survived_males = training_df[training_df.Sex == \"male\"][\"Survived\"].sum()\n\nprint(\"Total people survived is: \" +\n str((total_survived_females + total_survived_males)))\nprint(\"Proportion of Females who survived:\")\nprint(total_survived_females/(total_survived_females + total_survived_males))\nprint(\"Proportion of Males who survived:\")\nprint(total_survived_males/(total_survived_females + total_survived_males))\n\n\nsns.barplot(x=\"Pclass\", y=\"Survived\", data=training_df)\nplt.ylabel(\"Survival Rate\")\nplt.title(\"Distribution of Survival Based on Class\")\nplt.show()\n\ntotal_survived_one = training_df[training_df.Pclass == 1][\"Survived\"].sum()\ntotal_survived_two = training_df[training_df.Pclass == 2][\"Survived\"].sum()\ntotal_survived_three = training_df[training_df.Pclass == 3][\"Survived\"].sum()\ntotal_survived_class = total_survived_one + \\\n total_survived_two + total_survived_three\n\nprint(\"Total people survived is: \" + str(total_survived_class))\nprint(\"Proportion of Class 1 Passengers who survived:\")\nprint(total_survived_one/total_survived_class)\nprint(\"Proportion of Class 2 Passengers who survived:\")\nprint(total_survived_two/total_survived_class)\nprint(\"Proportion of Class 3 Passengers who survived:\")\nprint(total_survived_three/total_survived_class)\n\n\nsns.barplot(x=\"Sex\", y=\"Survived\", hue=\"Pclass\", data=training_df)\nplt.ylabel(\"Survival Rate\")\nplt.title(\"Survival Rates Based on Gender and Class\")\n\n\n# ### Correlation Heatmap\n\n#
\n# Seaborn Heatmap Documentation
\n# https://seaborn.pydata.org/generated/seaborn.heatmap.html\n#
\n\n\nf, ax = plt.subplots(figsize=(10, 10))\nsns.heatmap(training_df.corr(), annot=True, linewidths=0.5, fmt='.2f', ax=ax)\n\n\n# ## Preprocess Data for Analysis\n\n# ### Fix Missing Values, Feature Engineering\n\n# #### Encode Gender Male = 1, Female = 0\n\n\nsex_map = {\"male\": 1, \"female\": 0}\ntraining_df[\"Sex\"] = training_df[\"Sex\"].map(sex_map)\ntest_df[\"Sex\"] = test_df[\"Sex\"].map(sex_map)\n\n\ntraining_df[\"Sex\"].head()\n\n\n# Passenger Class (Pclass) and Number of Siblings (SibSp) provide insights into the social-economic status differences. The Age of the passenger will be imputed by median age of each sub-group\n\n# ### Training Dataset\n\n\ntraining_df1 = training_df.groupby([\"Pclass\", \"SibSp\"])\ntraining_df1_median = training_df1.median()\ntraining_df1_median\n\n\n# ### Test Dataset\n\n\ntest_df1 = test_df.groupby([\"Pclass\", \"SibSp\"])\ntest_df1_median = test_df1.median()\ntest_df1_median\n\n\n# ### Function - Impute Age\n\n\ndef impute_age(dataset, dataset_med):\n for x in range(len(dataset)):\n if dataset[\"Pclass\"][x] == 1:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[1, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[1, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[1, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[1, 3][\"Age\"]\n elif dataset[\"Pclass\"][x] == 2:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[2, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[2, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[2, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[2, 3][\"Age\"]\n elif dataset[\"Pclass\"][x] == 3:\n if dataset[\"SibSp\"][x] == 0:\n return dataset_med.loc[3, 0][\"Age\"]\n elif dataset[\"SibSp\"][x] == 1:\n return dataset_med.loc[3, 1][\"Age\"]\n elif dataset[\"SibSp\"][x] == 2:\n return dataset_med.loc[3, 2][\"Age\"]\n elif dataset[\"SibSp\"][x] == 3:\n return dataset_med.loc[3, 3][\"Age\"]\n elif dataset[\"SibSp\"][x] == 4:\n return dataset_med.loc[3, 4][\"Age\"]\n elif dataset[\"SibSp\"][x] == 5:\n return dataset_med.loc[3, 5][\"Age\"]\n elif dataset[\"SibSp\"][x] == 8:\n return dataset_med.loc[3][\"Age\"].median()\n\n\ntraining_df[\"Age\"] = training_df[\"Age\"].fillna(\n impute_age(training_df, training_df1_median))\ntest_df[\"Age\"] = test_df[\"Age\"].fillna(impute_age(test_df, test_df1_median))\n\n\nprint(training_df.isnull().sum()), print(test_df.isnull().sum())\n\n\n# ### Fix Missing Values Cabin\n\n\n\n", "project_metadata": {"full_name": "djp840/MSDS_422_Public", "description": null, "topics": [], "git_url": "git://github.com/djp840/MSDS_422_Public.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2020-09-01T17:39:51Z", "size": 29349, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 252678}, "last_updated": "2021-01-01T01:36:30Z"}, "intent": "# Missing Values will be filled by \"U\" = Unknown"}, {"original_comment": "# first let's re-see our dataset description\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Project: Medical Appointment No Shows\n#\n# Notebook based on notebook from kakgle: https://www.kaggle.com/kuroski/data-analyst-nanodegree-project-first-attempt/notebook\n#\n#\n# - wrangling the data\n# - make analysis based on my questions\n# - draw conclusions about my findings\n#\n#\n# ## Table of Contents\n# \n\n# \n# ## 1. 
Introduction\n#\n# This analysis consist in explore a dataset containing aproximately 100k medial appointments from the Brazilian public health system known as [SUS (Single Health System)](https://en.wikipedia.org/wiki/Sistema_%C3%9Anico_de_Sa%C3%BAde). We're gonna explore the [*no-show appointments dataset*](https://www.kaggle.com/joniarroba/noshowappointments) dataset using this variables:\n#\n# - **PatientId:** Identification of a patient\n# - **AppointmentID:** Identification of each appointment\n# - **Gender:** Male or Female\n# - **ScheduledDay:** The day of the actuall appointment, when they have to visit the doctor\n# - **AppointmentDay:** The day someone called or registered the appointment\n# - **Age:** How old is the patient\n# - **Neighbourhood:** Where the appointment takes place\n# - [**Scholarship:**](https://en.wikipedia.org/wiki/Bolsa_Fam%C3%ADlia) True or False, indicates if the patient is in the *Bolsa Familia* program - +500?\n# - **Hipertension:** True or False\n# - **Diabetes:** True or False\n# - **Alcoholism:** True or False\n# - **Handcap:** True or False\n# - **SMS_received:** 1 or more messages sent to the patient\n# - **No-show** \"No\" indicates if the patient showed up to their appointment and \"Yes\" if they didn't show up\n#\n# We're aiming to find possible reasons for patient no-showing at the scheduled appointments.\n\n#%%\n\n# first let's load our data\nfrom subprocess import call\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\"../input/KaggleV2-May-2016.csv\")\ndf.head(10)\n\n#%%\n\n# let's see from which period theese appointments are\ndf.AppointmentDay.min(), df.AppointmentDay.max()\n\n#%%\n\n# then let's see the shape of our data\ndf.shape\n\n#%%\n\n# and get general numeric attributes\ndf.describe()\n\n#%%\n\ndf['No-show'].value_counts()\n\n\n# \n# ### 1.1. First impressions\n#\n# First of all, we can notice that:\n# - We have 110k+ rows and 14 columns in this dataset\n# - I want to check if the patientId is duplicated since I don't know if they can have more than one appointment\n# - Depending on the analysis, maybe the **PatientId**, **AppointmentID** could be useless for the analysis\n# - It's important to highlight that the **No-show** column value **Yes** means that the pacient didn't show at the appointment\n#\n# Also we can already draw some assumptions:\n# - there are more than 100k appointments scheduled in a period of **~2 months** and that's really impressive\n# - most of the patients have about 37 yeas old with almost no scholarship (9%)\n# - in mean\n# - 19% of the patients have hipertension\n# - 7% of the patients have diabetes\n# - 3% of the patients suffers from alcoholism\n# - 32% of the patients recived SMS\n#\n# I'll analyse the **\"No-Show\"** as my dependent variable since to me it's the most important one and it's strongly related to what we want to discover here.\n#\n# And I'll use all dataset variables in my analysis, I just want to check on the **Data wrangling** stage if we really need the:\n# - Patient id\n# - Appointment id\n#\n# Since it look's like this variables are not that important for this analysis.\n#\n# \n# ### 1.2. 
Questions\n#\n# The main questions I want to answer are:\n# - Based on the variables we have, what is the most relevant factor that is influencing the patient to no showing the medical appointment?\n# - There are any relation on these variables that can lead us to a more specific group of patients?\n# - The day of the week of the appointment influences the patient no showing?\n# - The waiting time of the patient between the schedule day and the appointment day influences it's no showing?\n# - There are patients no showing in appointments on the same day?\n#\n# \n# ## 2. Data Wrangling\n#\n# In this section the data will be analysed and cleaned, dealing with missing values or weird values.\n# Also we're gonna explore in a deeper lever in a way that maybe we can formulate more questions.\n#\n# \n# ### 2.1. General Properties\n#\n# Here we're gonna explore our dataset properties checking for:\n# - What kind of variables we need to:\n# - convert the data type\n# - drop from the dataset\n# - Check for duplicates\n# - Check for weird values (outliers)\n# - Gather more information about a specific variable\n# - Check if we need to create more columns with usefull data for the exploration\n\n#%%\n\n# checking column information for missing values and strange types\ndf.info()\n\n#%%\n\n# checking for general data duplicates\ndf.duplicated().sum(), df.PatientId.duplicated(\n).sum(), df.AppointmentID.duplicated().sum()\n\n#%%\n\n# checking all possible values on some columns\nprint(df.Gender.unique())\nprint(sorted(df.Age.unique()))\nprint(sorted(df.Neighbourhood.unique()))\nprint(df.Scholarship.unique())\nprint(df.Hipertension.unique())\nprint(df.Diabetes.unique())\nprint(df.Alcoholism.unique())\nprint(df.Handcap.unique())\nprint(df.SMS_received.unique())\nprint(df['No-show'].unique())\n\n\n# It looks like we have a good dataset:\n# - no missing lines\n# - no duplicaded AppointmentID\n# - we discover that indeed we need the **Patient ID** since it seems some patients try to make new appointments\n# - there are no weird values on most columns\n#\n# But we'll need to do some cleaning here:\n# - we need to fix some data typings\n# - ScheduledDay and AppointmentDay makes sense to be a date/datetime type\n# - No-Show makes sense to be a boolean\n# - PatientId makes sense to be converted as string to prevent from being applied as a numerical operation since it represents the patient identification????\n# - Neighbourhood colulde be convert to caterogical but before - predictong\n# - **Appointment ID** seems to not be usefull for this analysis\n# - **Handcap** variable have values beyond True and False, and we can see [here](https://www.kaggle.com/joniarroba/noshowappointments/discussion/29699#229356) that this occurs because the handcap field represents the number of patient disabilities. But it could by catedgorize, eg. in Poland we heve 3 group of level handicap\n#\n# And we can also make more questions:\n# - On the problem summary they don't mention on which location we're analysing, I've thought that we're analysing data from several cities from Brazil, but it seams that we're seing a specific region\n# - We can see [here](https://www.kaggle.com/joniarroba/noshowappointments/discussion/38330#) that this dataset contains appointments from *Vit\u00f3ria - ES* in Brazil and this turns out to be even more impressive that there are more than 100k schedulled in a ~2 month period in 2016\n#\n# Now we can analyse also the neighbourhood data either!\n#\n# \n# ### 2.2. 
Data Cleaning\n#\n# Here we're need to:\n# - remove useless columns\n# - rename the columns to use easier names during the exploration while fixing typos\n# - format patient id column to string\n# - format all date related columns to the correct type\n# - remove the weird age value that is less than zero\n# - format the handicap field correctly, since we only want to know if the patient is handicap and not how many disabilities they have\n# - add new columns\n# - appointment_week_day: to show what day of the week the appointment was scheduled\n# - appointment_waiting_time: waiting time to the appointment\n\n#%%\n\n# let's remove some useless columns\n# I think the appointmentID is useless for this analysis\ndf.drop(['AppointmentID'], axis=1, inplace=True)\ndf.columns\n\n#%%\n\n# renaming all columns to simpler names for our exploration\ndf.rename(columns={'PatientId': 'patient_id', 'ScheduledDay': 'scheduled_day', 'AppointmentDay': 'appointment_day',\n 'SMS_received': 'received_sms', 'No-show': 'no_show', 'Handcap': 'handicap'}, inplace=True)\ndf.rename(columns=lambda x: x.lower(), inplace=True)\ndf.columns\n\n#%%\n\n# formatting the patient_id column as string\ndf.patient_id = df.patient_id.apply(lambda patient: str(int(patient)))\n\n#%%\n\n# formatting the date time 'scheduled_day' and 'appointment_day' columns\n# i'm just testing different forms of time conversion here\ndf.scheduled_day = pd.to_datetime(df.scheduled_day)\ndf.appointment_day = df.appointment_day.apply(np.datetime64)\n\ndf.scheduled_day.head(1), df.appointment_day.head(1)\n\n#%%\n\n# formatting the 'no_show' column with lower cases\ndf.no_show = df.no_show.map({'No': 'no', 'Yes': 'yes'})\n\ndf.no_show.unique()\n\n#%%\n\ndf.age.hist(figsize=(15, 8), bins=115)\n\n#%%\n\n# discart the ages bellow zero\ndf = df.query('age >= 0')\nprint(sorted(df.age.unique()))\n\n#%%\n\ndf['handicap'].unique()\n\n#%%\n\ndf['handicap'].value_counts()\n\n#%%\n\n# remove the weird values from handcap variable\ndf.loc[df.handicap > 1, 'handicap'] = 1\ndf.handicap.unique()\n\n#%%\n\n# creating the first column \"appointment_week_day\"\ndf['appointment_week_day'] = df.appointment_day.map(lambda day: day.day_name())\ndf.appointment_week_day.head(7000)\n\n#%%\n\n# solving problem \"DatetimeArray subtraction must have the same timezones or no timezones\"\n\n#df['appointment_day'] = df['appointment_day'].dt.tz_convert(None)\ndf['scheduled_day'] = df['scheduled_day'].dt.tz_convert(None)\n\n#%%\n\n# creating the second column \"appointment_waiting_time\"\ndf[\"appointment_waiting_days\"] = df.appointment_day - df.scheduled_day\ndf.appointment_waiting_days.head(10)\n\n#%%\n\n# well it seams that some are treated on the same day that they scheduled\n# we can prevent that weird value by calculating the the \"absolute value\" of this column\n# and then converting the \"time\" to \"days\"\ndf.appointment_waiting_days = df.appointment_waiting_days.abs().dt.days\ndf.appointment_waiting_days.head(10)\n\n#%%\n\n# let's see how our data looks like after all cleanning\ndf.head(5)\n\n\n# It seams we have all the data we need to start exploring and answer the questions.\n#\n# \n# ## 3. 
Exploratory Data Analysis\n#\n# First, let's review all questions that I want to answer:\n#\n# - Based on the variables we have, what is the most relevant factor that is influencing the patient to no showing the medical appointment?\n# - There are any relation on these variables that can lead us to a more specific group of patients?\n# - The day of the week of the appointment influences the patient no showing?\n# - The waiting time of the patient between the schedule day and the appointment day influences it's no showing?\n# - There are patients no showing in appointments on the same day?\n#\n# Let's:\n# - analyse our data\n# - mix them up\n# - get assumptions along the way\n# - answer our questions\n\n#%%", "target_code": "df.describe()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Project: Medical Appointment No Shows\n#\n# Notebook based on notebook from kakgle: https://www.kaggle.com/kuroski/data-analyst-nanodegree-project-first-attempt/notebook\n#\n#\n# - wrangling the data\n# - make analysis based on my questions\n# - draw conclusions about my findings\n#\n#\n# ## Table of Contents\n# \n\n# \n# ## 1. Introduction\n#\n# This analysis consist in explore a dataset containing aproximately 100k medial appointments from the Brazilian public health system known as [SUS (Single Health System)](https://en.wikipedia.org/wiki/Sistema_%C3%9Anico_de_Sa%C3%BAde). We're gonna explore the [*no-show appointments dataset*](https://www.kaggle.com/joniarroba/noshowappointments) dataset using this variables:\n#\n# - **PatientId:** Identification of a patient\n# - **AppointmentID:** Identification of each appointment\n# - **Gender:** Male or Female\n# - **ScheduledDay:** The day of the actuall appointment, when they have to visit the doctor\n# - **AppointmentDay:** The day someone called or registered the appointment\n# - **Age:** How old is the patient\n# - **Neighbourhood:** Where the appointment takes place\n# - [**Scholarship:**](https://en.wikipedia.org/wiki/Bolsa_Fam%C3%ADlia) True or False, indicates if the patient is in the *Bolsa Familia* program - +500?\n# - **Hipertension:** True or False\n# - **Diabetes:** True or False\n# - **Alcoholism:** True or False\n# - **Handcap:** True or False\n# - **SMS_received:** 1 or more messages sent to the patient\n# - **No-show** \"No\" indicates if the patient showed up to their appointment and \"Yes\" if they didn't show up\n#\n# We're aiming to find possible reasons for patient no-showing at the scheduled appointments.\n\n\n# first let's load our data\nfrom subprocess import call\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\"../input/KaggleV2-May-2016.csv\")\ndf.head(10)\n\n\n# let's see from which period theese appointments are\ndf.AppointmentDay.min(), df.AppointmentDay.max()\n\n\n# then let's see the shape of our data\ndf.shape\n\n\n# and get general numeric attributes\ndf.describe()\n\n\ndf['No-show'].value_counts()\n\n\n# \n# ### 1.1. 
First impressions\n#\n# First of all, we can notice that:\n# - We have 110k+ rows and 14 columns in this dataset\n# - I want to check if the patientId is duplicated since I don't know if they can have more than one appointment\n# - Depending on the analysis, maybe the **PatientId**, **AppointmentID** could be useless for the analysis\n# - It's important to highlight that the **No-show** column value **Yes** means that the pacient didn't show at the appointment\n#\n# Also we can already draw some assumptions:\n# - there are more than 100k appointments scheduled in a period of **~2 months** and that's really impressive\n# - most of the patients have about 37 yeas old with almost no scholarship (9%)\n# - in mean\n# - 19% of the patients have hipertension\n# - 7% of the patients have diabetes\n# - 3% of the patients suffers from alcoholism\n# - 32% of the patients recived SMS\n#\n# I'll analyse the **\"No-Show\"** as my dependent variable since to me it's the most important one and it's strongly related to what we want to discover here.\n#\n# And I'll use all dataset variables in my analysis, I just want to check on the **Data wrangling** stage if we really need the:\n# - Patient id\n# - Appointment id\n#\n# Since it look's like this variables are not that important for this analysis.\n#\n# \n# ### 1.2. Questions\n#\n# The main questions I want to answer are:\n# - Based on the variables we have, what is the most relevant factor that is influencing the patient to no showing the medical appointment?\n# - There are any relation on these variables that can lead us to a more specific group of patients?\n# - The day of the week of the appointment influences the patient no showing?\n# - The waiting time of the patient between the schedule day and the appointment day influences it's no showing?\n# - There are patients no showing in appointments on the same day?\n#\n# \n# ## 2. Data Wrangling\n#\n# In this section the data will be analysed and cleaned, dealing with missing values or weird values.\n# Also we're gonna explore in a deeper lever in a way that maybe we can formulate more questions.\n#\n# \n# ### 2.1. 
General Properties\n#\n# Here we're gonna explore our dataset properties checking for:\n# - What kind of variables we need to:\n# - convert the data type\n# - drop from the dataset\n# - Check for duplicates\n# - Check for weird values (outliers)\n# - Gather more information about a specific variable\n# - Check if we need to create more columns with usefull data for the exploration\n\n\n# checking column information for missing values and strange types\ndf.info()\n\n\n# checking for general data duplicates\ndf.duplicated().sum(), df.PatientId.duplicated(\n).sum(), df.AppointmentID.duplicated().sum()\n\n\n# checking all possible values on some columns\nprint(df.Gender.unique())\nprint(sorted(df.Age.unique()))\nprint(sorted(df.Neighbourhood.unique()))\nprint(df.Scholarship.unique())\nprint(df.Hipertension.unique())\nprint(df.Diabetes.unique())\nprint(df.Alcoholism.unique())\nprint(df.Handcap.unique())\nprint(df.SMS_received.unique())\nprint(df['No-show'].unique())\n\n\n# It looks like we have a good dataset:\n# - no missing lines\n# - no duplicaded AppointmentID\n# - we discover that indeed we need the **Patient ID** since it seems some patients try to make new appointments\n# - there are no weird values on most columns\n#\n# But we'll need to do some cleaning here:\n# - we need to fix some data typings\n# - ScheduledDay and AppointmentDay makes sense to be a date/datetime type\n# - No-Show makes sense to be a boolean\n# - PatientId makes sense to be converted as string to prevent from being applied as a numerical operation since it represents the patient identification????\n# - Neighbourhood colulde be convert to caterogical but before - predictong\n# - **Appointment ID** seems to not be usefull for this analysis\n# - **Handcap** variable have values beyond True and False, and we can see [here](https://www.kaggle.com/joniarroba/noshowappointments/discussion/29699#229356) that this occurs because the handcap field represents the number of patient disabilities. But it could by catedgorize, eg. in Poland we heve 3 group of level handicap\n#\n# And we can also make more questions:\n# - On the problem summary they don't mention on which location we're analysing, I've thought that we're analysing data from several cities from Brazil, but it seams that we're seing a specific region\n# - We can see [here](https://www.kaggle.com/joniarroba/noshowappointments/discussion/38330#) that this dataset contains appointments from *Vit\u00f3ria - ES* in Brazil and this turns out to be even more impressive that there are more than 100k schedulled in a ~2 month period in 2016\n#\n# Now we can analyse also the neighbourhood data either!\n#\n# \n# ### 2.2. 
Data Cleaning\n#\n# Here we're need to:\n# - remove useless columns\n# - rename the columns to use easier names during the exploration while fixing typos\n# - format patient id column to string\n# - format all date related columns to the correct type\n# - remove the weird age value that is less than zero\n# - format the handicap field correctly, since we only want to know if the patient is handicap and not how many disabilities they have\n# - add new columns\n# - appointment_week_day: to show what day of the week the appointment was scheduled\n# - appointment_waiting_time: waiting time to the appointment\n\n\n# let's remove some useless columns\n# I think the appointmentID is useless for this analysis\ndf.drop(['AppointmentID'], axis=1, inplace=True)\ndf.columns\n\n\n# renaming all columns to simpler names for our exploration\ndf.rename(columns={'PatientId': 'patient_id', 'ScheduledDay': 'scheduled_day', 'AppointmentDay': 'appointment_day',\n 'SMS_received': 'received_sms', 'No-show': 'no_show', 'Handcap': 'handicap'}, inplace=True)\ndf.rename(columns=lambda x: x.lower(), inplace=True)\ndf.columns\n\n\n# formatting the patient_id column as string\ndf.patient_id = df.patient_id.apply(lambda patient: str(int(patient)))\n\n\n# formatting the date time 'scheduled_day' and 'appointment_day' columns\n# i'm just testing different forms of time conversion here\ndf.scheduled_day = pd.to_datetime(df.scheduled_day)\ndf.appointment_day = df.appointment_day.apply(np.datetime64)\n\ndf.scheduled_day.head(1), df.appointment_day.head(1)\n\n\n# formatting the 'no_show' column with lower cases\ndf.no_show = df.no_show.map({'No': 'no', 'Yes': 'yes'})\n\ndf.no_show.unique()\n\n\ndf.age.hist(figsize=(15, 8), bins=115)\n\n\n# discart the ages bellow zero\ndf = df.query('age >= 0')\nprint(sorted(df.age.unique()))\n\n\ndf['handicap'].unique()\n\n\ndf['handicap'].value_counts()\n\n\n# remove the weird values from handcap variable\ndf.loc[df.handicap > 1, 'handicap'] = 1\ndf.handicap.unique()\n\n\n# creating the first column \"appointment_week_day\"\ndf['appointment_week_day'] = df.appointment_day.map(lambda day: day.day_name())\ndf.appointment_week_day.head(7000)\n\n\n# solving problem \"DatetimeArray subtraction must have the same timezones or no timezones\"\n\n#df['appointment_day'] = df['appointment_day'].dt.tz_convert(None)\ndf['scheduled_day'] = df['scheduled_day'].dt.tz_convert(None)\n\n\n# creating the second column \"appointment_waiting_time\"\ndf[\"appointment_waiting_days\"] = df.appointment_day - df.scheduled_day\ndf.appointment_waiting_days.head(10)\n\n\n# well it seams that some are treated on the same day that they scheduled\n# we can prevent that weird value by calculating the the \"absolute value\" of this column\n# and then converting the \"time\" to \"days\"\ndf.appointment_waiting_days = df.appointment_waiting_days.abs().dt.days\ndf.appointment_waiting_days.head(10)\n\n\n# let's see how our data looks like after all cleanning\ndf.head(5)\n\n\n# It seams we have all the data we need to start exploring and answer the questions.\n#\n# \n# ## 3. 
Exploratory Data Analysis\n#\n# First, let's review all questions that I want to answer:\n#\n# - Based on the variables we have, what is the most relevant factor that is influencing the patient to no showing the medical appointment?\n# - There are any relation on these variables that can lead us to a more specific group of patients?\n# - The day of the week of the appointment influences the patient no showing?\n# - The waiting time of the patient between the schedule day and the appointment day influences it's no showing?\n# - There are patients no showing in appointments on the same day?\n#\n# Let's:\n# - analyse our data\n# - mix them up\n# - get assumptions along the way\n# - answer our questions\n\n", "project_metadata": {"full_name": "DataWorkshop-Foundation/warsaw-project-2", "description": "Warsaw group | Project 2 | Appointment No-Shows", "topics": [], "git_url": "git://github.com/DataWorkshop-Foundation/warsaw-project-2.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2020-07-15T11:28:37Z", "size": 95131, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 13902464}, "last_updated": "2021-01-07T19:45:24Z"}, "intent": "# re-see our dataset description"}, {"original_comment": " # Execute the circuit\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"QCBQ\n\n# # Implementing Variational Quantum Algorithms\n\n# **Author:** Ryan LaRose\n#\n# (lightly modified by Justin for use in the QuIC seminar)\n\n# In this notebook, we'll introduce a class of circuits/algorithms known as **variational quantum algorithms**. Variational algorithms are a class of algorithms that may foreseeably be implemented on near term intermediate scale quantum computers, as opposed to things like Shor's factoring algorithm which require large scale, fault tolerant quantum computers. The idea is to build circuits with parameters we may vary, estimate the expectation values of those circuits, compute some cost function, and implement classical optimization algorithms so we can figure out how to vary our circuit to get closer to the true answer. Qiskit has a package (Qiskit Aqua) that does this all behind the scenes, but it's important to understand what's actually going on in the code, so we'll implement all these steps ourselves!\n\n# ## Learning goals\n\n# (1) Be able to implement a circuit with variable parameters and update these parameters.\n#\n# (2) Be able to estimate expectation values using a quantum circuit.\n#\n# (3) Understand how to compute a cost function with a quantum circuit.\n#\n# (4) Use a classical optimization algorithm to find the best parameters.\n\n# ## Helpful background\n\n# * [Born's rule](https://en.wikipedia.org/wiki/Born_rule) for probabilities in quantum mechanics.\n\n#%%\n\n\"\"\"Imports for the notebook.\"\"\"\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom scipy.optimize import minimize\n\nimport qiskit as qis\n\n\n# ## Circuits with parameters\n\n# A key characteristic of any variational quantum algorithm is a parameterized circuit. More generally, variational quantum algorithms are instances of the well-known variational principle of quantum mechanics. This principle states that you can never get below the ground state energy of a particular Hamiltonian. 
The proof of this theorem is \"duh, ground state means lowest energy.\" The formal proof is just a mathematical translation of this fact.\n\n# **Theorem (The Variational Principle):** Let $H = H^\\dagger$ be a Hamiltonian with spectrum (energy) $E_0 \\le E_1 \\le \\cdots \\le E_n$. Then, for any valid wavefunction $|\\psi\\rangle$,\n#\n# $$ E_0 \\le \\langle \\psi | H | \\psi \\rangle .$$\n\n# *Proof*: Write $|\\psi\\rangle$ in the eigenbasis of $H$\n#\n# $$ |\\psi\\rangle = \\sum_n c_n |\\psi_n\\rangle $$\n#\n# where $c_n := \\langle \\psi | \\psi_n \\rangle$ and $H |\\psi_n \\rangle = E_n |\\psi_n\\rangle$. Note that $\\sum_n |c_n|^2 = 1$ by virtue of proper normalization. Now,\n#\n# \\begin{align}\n# \\langle \\psi | H | \\psi \\rangle &= \\left[ \\sum_m c_m | \\psi_m \\rangle \\right]^\\dagger H \\left[ \\sum_n c_n |\\psi_n\\rangle \\right] \\\\\n# &= \\sum_{m, n} E_n c_m^* c_n \\langle \\psi_m | \\psi_n \\rangle \\\\\n# &= \\sum_n E_n |c_n|^2 \\\\\n# &\\ge E_0 \\sum_n |c_n|^2 \\\\\n# &= E_0 .\n# \\end{align}\n#\n# _______________________________________________________________________________\n\n# The **big idea** in all variational methods is to prepare some \"**ansatz\"** wavefunction\n#\n# $$ |\\psi\\rangle = |\\psi(\\mathbf{\\alpha}) \\rangle $$\n#\n# parameterized by $\\mathbf{\\alpha} = (\\alpha_1, \\alpha_2, ..., \\alpha_n)$. When solving quantum mechanics problems, we then vary $\\mathbf{\\alpha}$ to try and minimize $\\langle\\psi(\\alpha)|\\mathbf{H}|\\psi(\\alpha)\\rangle$ and find a wavefunction close to the ground state wavefunction. In the context of quantum computing, we prepare states by implementing gates, so our gates should have some tunable parameters in them. This is what we mean by \"circuits with parameters.\"\n\n# ### Defining circuits with parameters\n\n# The code cell below shows an example of defining a two qubit circuit with two parameters. Run this cell to visualize the circuit.\n\n#%%\n\n\"\"\"Defining circuits with parameters.\"\"\"\n# Get a circuit and registers\nqreg = qis.QuantumRegister(2)\ncreg = qis.ClassicalRegister(2)\ncirc = qis.QuantumCircuit(qreg, creg)\n\n# Add gates with particular parameters\ncirc.h(qreg)\ncirc.rx(0.2, qreg[0])\ncirc.cx(qreg[0], qreg[1])\ncirc.ry(0.1, qreg[1])\n\n# Visualize the circuit\nprint(circ)\n\n\n# The angles come in through single qubit rotation gates. For this example, we instantiated these angles with example (arbitrary) values. Generally, it is useful to have some means of quickly generating the same circuit but with different variational parameters. In terms of programming, this can be done in a variety of ways, including symbolic representation of parameters or functions. We'll use the latter case for this notebook.\n\n# **Do this:** In the cell below, write a function in python which inputs two paramters (floats) and returns the above circuit with rotation angles equal to these parameters. If you've never written a python function before, ask for help! It's super easy (no curly brackets required). Once you've done that, verify you can call your function by running the cell under that.\n\n#%%\n\ndef circuit(alpha1: float, alpha2: float):\n \"\"\"Returns the circuit above with the input parameters.\"\"\"\n # Your code here!\n\n return circ\n\n#%%\n\nmy_circ = circuit(0.2, 0.3)\nprint(my_circ)\n\n\n# We can now easily vary the parameters in our circuit! You could of course do this by creating a new cell and calling the function with several different parameters. 
(You should do this to at least make sure your function works!)\n#\n# The reason we vary parameters is to minimize some energy (which comes from a cost function/Hamiltonian). This involves two things:\n#\n# 1. Computing the energy.\n# 1. Varying the parameters.\n#\n# We'll break down computing energy into two sub-steps: Individual expectation values, and weighted sums of expectation values.\n\n# ## Computing expectation values\n\n# So far, we know that variational algorithms tackle the problem\n#\n# $$ \\min_{\\alpha} \\langle \\psi(\\alpha) | H | \\psi(\\alpha) \\rangle $$\n#\n# **How do we compute expectation values like $\\langle \\psi | H | \\psi \\rangle$ on a quantum computer?**\n\n# ### A diagonal operator\n\n# Suppose first we have a one qubit wavefunction $|\\psi\\rangle$ and we want to compute\n#\n# $$\\langle \\psi | Z | \\psi \\rangle.$$\n#\n# where $Z$ is the usual Pauli suspect\n#\n# $$ Z = \\left[ \\begin{matrix}\n# 1 & 0 \\\\\n# 0 & -1 \\\\\n# \\end{matrix} \\right] $$\n#\n# in the computational basis.\n#\n# Starting from first principles, Born's rule tells us that the probability of measuring $0$ is\n#\n# $$ p(0) = | \\langle \\psi | 0 \\rangle |^2 =\n# \\langle \\psi | 0 \\rangle \\langle 0 | \\psi \\rangle =\n# \\langle \\psi | \\Pi_0 | \\psi \\rangle $$\n#\n# where $\\Pi_0 := |0\\rangle \\langle 0 |$ is the projector onto the $|0\\rangle$ state.\n\n# **Question:** Write out the $2 \\times 2$ matrix representation of $\\Pi_0$ in the computational basis.\n\n# **Answer:** Answer the above question here!\n\n# Similarly, the probability of measuring $1$ is\n#\n# $$ p(1) = | \\langle \\psi | 1 \\rangle |^2 =\n# \\langle \\psi | 1 \\rangle \\langle 1 | \\psi \\rangle =\n# \\langle \\psi | \\Pi_1 | \\psi \\rangle $$\n#\n# where $\\Pi_1 := |1 \\rangle \\langle 1 |$ is the projector onto the $|1\\rangle$ state.\n\n# **Question:** Write out the $2 \\times 2$ matrix representation of $\\Pi_1$ in the computational basis.\n\n# **Answer:** Answer the above question here!\n\n# **Question:** Prove that $\\Pi_0 + \\Pi_1 = I$ (the identity). Use this to check that $p(0) + p(1) = 1$, as it must.\n\n# **Answer:** Answer the above question here!\n\n# The previous question showed that if we add together the projectors $\\Pi_0$ and $\\Pi_1$, nothing interesting happens. The follow question shows that if we **subtract** the projectors, something interesting happens!\n\n# **Question:** Prove that $\\Pi_0 - \\Pi_1 = Z$.\n\n# **Answer:** Answer the above question here!\n\n# As you showed above, we have\n#\n# $$ Z = \\Pi_0 - \\Pi_1. $$\n#\n# We can use this to estimate $\\langle \\psi | Z | \\psi \\rangle$ by measuring in the standard basis and doing a bit of \"classical postprocessing,\" which here just means subtracting the outcome probabilities.\n#\n# $$ \\langle \\psi | Z | \\psi \\rangle = \\langle \\psi | (\\Pi_0 - \\Pi_1) | \\psi \\rangle =\n# \\langle \\psi | \\Pi_0 | \\psi \\rangle - \\langle \\psi | \\Pi_1 | \\psi \\rangle =\n# p(0) - p(1) . $$\n#\n# Of course, we won't have the full probability distribution $p$, but instead we'll have to sample from the circuit many times ($N$ times) to get a good estimate $p(0) \\approx f(0) / N$, where $f(0)$ is the frequency of measuring $0$. Similarly for $p(1)$.\n\n# **Do this:** In the following cell, a one qubit state $|\\psi\\rangle = H|0\\rangle$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | Z | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. 
Compute the expectation analytically, and show the results agree.\n\n#%%\n\n\"\"\"Estimating a one qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(1)\ncreg = qis.ClassicalRegister(1)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.h(qreg)\n\n# Your code here!\n\n\n# ### A non-diagonal operator\n\n# Suppose now, for example, we want to measure the expecation value of $X$, which is not diagonal in the computational basis:\n#\n# $$\\langle \\psi | X | \\psi \\rangle.$$\n#\n# Here, the key \"trick\" is to rotate to the eigenbasis, where $X$ becomes diagonal. You may recall or wish to prove that\n#\n# $$ HXH = Z . $$\n#\n# Let $|\\psi'\\rangle = H |\\psi \\rangle$, and suppose what happens when we measure $|\\psi'\\rangle$ in the computational basis. By the same argument above, we have\n#\n# $$ p(0) = \\langle \\psi ' | \\Pi_0 | \\psi' \\rangle =\n# \\langle \\psi | H \\Pi_0 H | \\psi \\rangle,$$\n#\n# where in the last step we substituted $|\\psi'\\rangle = H |\\psi\\rangle$. Similarly,\n#\n# $$ p(1) = \\langle \\psi ' | \\Pi_1 | \\psi' \\rangle =\n# \\langle \\psi | H \\Pi_1 H | \\psi \\rangle . $$\n#\n# We can now subtract the probabilities, again using the fact that $Z = \\Pi_0 - \\Pi_1$, as above:\n#\n# $$ p(0) - p(1) = \\langle \\psi | H \\Pi_0 H | \\psi \\rangle - \\langle \\psi | H \\Pi_1 H | \\psi \\rangle =\n# \\langle \\psi | H (\\Pi_0 - \\Pi_1 ) H | \\psi \\rangle =\n# \\langle \\psi | H Z H | \\psi \\rangle =\n# \\langle \\psi | X | \\psi \\rangle, $$\n#\n# which is exactly the quantity we want to compute.\n#\n# This is why we implement the \"appropriate rotation\" when we want to measure the expectation value of an operator. The key insight is that measurement probabilities can be written as expectation values of projectors.\n\n# **Do this:** In the following cell, a one qubit state $|\\psi\\rangle = |-\\rangle = 1/\\sqrt{2}(|0\\rangle - |1\\rangle)$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | X | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. Compute the expectation analytically, and show the results agree.\n#\n# Note that the output of counts is a [dictionary](https://docs.python.org/3/library/stdtypes.html#typesmapping) containing the result paired to the number of counts. If your circuit produces no counts of a certain result, it won't output a dictionary element for that result, so you may have to include an ```if``` statement to account for that.\n\n#%%\n\n\"\"\"Estimating a one qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(1)\ncreg = qis.ClassicalRegister(1)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.x(qreg)\ncirc.h(qreg)\n\n# Your code here!\n\n\n# ### A two-qubit operator\n\n# We now know how to estimate $\\langle Z \\rangle$ and $\\langle X \\rangle$. **What if we want to estimate $\\langle Z \\otimes X \\rangle$?**\n#\n# The trick is to do the same thing! We still rotate to the eigenbasis of each operator and measure both qubits. We now have four possible measurement outcomes and so four probabilities: $p(00)$, $p(01)$, $p(10)$, and $p(11)$. What is the appropriate \"classical post-processing\" to do with these sampled outcomes?\n#\n# We can measuring the expectations separately and expand the product:\n#\n# $$ (p_0(0) - p_0(1))(p_1(0) - p_1(1)) = p(00) - p(01) - p(10) + p(11). $$\n#\n# Note that the subscript on the LHS refers to the qubit (either $0$ or $1$), and the value in parentheses refers to the measurement outcome. 
On the RHS, $p(00)$ is equivalent to $p_0(0) p_1(0)$, etc.\n\n# **Do this:** In the following cell, a two qubit state $|\\psi\\rangle = |+\\rangle = 1/\\sqrt{2}(|0\\rangle + |1\\rangle)$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | Z \\otimes X | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. Compute the expectation analytically, and show the results agree.\n\n#%%\n\n\"\"\"Estimating a two qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(2)\ncreg = qis.ClassicalRegister(2)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.h(qreg[1])\n\n# Your code here!\n\n\n# ### A general operator\n\n# Thankfully for our discussion, any $n$-qubit operator $O$ can be decomposed in the Pauli basis:\n#\n# $$ O = \\sum_i o_i \\sigma_{i_1} \\otimes \\sigma_{i_2} \\otimes \\cdots \\otimes \\sigma_{i_n} $$\n#\n# Here, $o_i \\in \\mathbb{C}$ is a scalar and each $\\sigma$ is a Pauli. By linearity, to evaluate the expectation, we can evaluate the expectation of each Pauli string:\n#\n# $$ \\langle O \\rangle = \\sum_i o_i \\langle \\sigma_{i_1} \\otimes \\sigma_{i_2} \\otimes \\cdots \\otimes \\sigma_{i_n} \\rangle $$\n#\n# Thus, **to evaluate the expectation of any operator, it suffices to evaluate expectations of arbitrary Pauli strings**. Below, you are asked to write a function which generalizes the two qubit expectation value measurement $\\langle Z \\otimes X \\rangle$ to any number of qubits. Before tackling this, you may wish to prove the following fact to yourself, based on the expansion of the two-qubit expectation above.\n\n# **Question**: We can think of the classical post-processing for computing expectation values as follows. From our circuit of $n$ qubits, we measure the frequency of $2^n$ possible bitstrings. For example, for $n = 3$, the possible bitstrings we can measure are 000, 001, 010, 011, 100, 101, 110, and 111. The classical post-processing consists of summing up these frequencies **with the appropriate sign**. Prove that the sign for bitstring $z_i$ is $(-1)^{N_0(z_i)}$ where $N_0(z_i)$ is the number of zeros in bit string $z_i$.\n\n# **Do this:** Answer the above question here!\n\n# The block of code below contains a function ```expectation_circuit``` for finding appropriately modifying a circuit to measure the expectation value of a general string of operators. The function takes in an $n$ qubit circuit and a string of length $n$ containing pauli operators, and outputs the appropriate circuit to run to estimate the expectation value of the operator defined by the string. For example, if we had a 3 qubit circuit and wanted to measure $I\\otimes X \\otimes Z$, we would pass the circuit and ``` \"IXZ\" ``` to the function.\n#\n# Read through this function, and make sure you understand what's going on!\n\n#%%\n\n\"\"\"Helper function to evaluate the expectation of any valid Pauli string.\"\"\"\n\n\ndef expectation_circuit(circuit: qis.QuantumCircuit, pauli_string: str) -> qis.QuantumCircuit:\n \"\"\"Returns a circuit to compute expectation of the Pauli string in the \n state prepared by the input circuit.\n\n Args:\n circuit: Prepares the state |\\psi> from |0>.\n pauli_string: String (tensor product) of Paulis to evaluate\n an expectation of. The length of pauli_string\n must be equal to the total number of qubits in\n the circuit. 
(Use identities for no operator!)\n \"\"\"\n temp = circuit.copy()\n\n if len(circuit.qregs) != 1:\n raise ValueError(\"Circuit should have only one quantum register.\")\n if len(circuit.cregs) != 1:\n print(\"# cregs =\", len(circuit.cregs))\n raise ValueError(\"Circuit should have only one classical register.\")\n\n qreg = circuit.qregs[0]\n creg = circuit.cregs[0]\n nqubits = len(qreg)\n pauli_string = pauli_string.upper().strip()\n\n if len(pauli_string) != nqubits:\n raise ValueError(\n f\"Circuit has {nqubits} qubits but pauli_string has {len(pauli_string)} operators.\"\n )\n\n for (qubit, pauli) in enumerate(pauli_string):\n if pauli == \"I\":\n continue\n elif pauli == \"X\":\n temp.h(qreg[qubit])\n temp.measure(qreg[qubit], creg[qubit])\n elif pauli == \"Y\":\n temp.s(qreg[qubit])\n temp.h(qreg[qubit])\n temp.measure(qreg[qubit], creg[qubit])\n elif pauli == \"Z\":\n temp.measure(qreg[qubit], creg[qubit])\n else:\n raise ValueError(\n f\"{pauli} is an invalid Pauli string key. Should be I, X, Y, or Z.\")\n\n return temp\n\n\n# **Do this:** As a sanity check, let's test this function below on measuring $X \\otimes Y$. Use the function to modify the input circuit, then print out the modified circuit. Test this on other Pauli strings as well.\n\n#%%\n\n\"\"\"Test your function here.\"\"\"\ncirc = circuit(np.pi / 2, np.pi / 4)\nprint(\"Bare circuit:\")\nprint(circ)\n\n# Your code here!\n\n\n# The following cell contains a function that takes in an \"expectation circuit\" (a circuit with the appropriate transformations applied at the end) and performs the classical post-processing to return an expectation value. Once again, read through this function and make sure you understand what's going on!\n\n#%%\n\n\"\"\"Function to execute the circuit and do the postprocessing.\"\"\"\n\n\ndef run_and_process(circuit: qis.QuantumCircuit, shots: int = 10000) -> float:\n \"\"\"Runs an 'expectation circuit' and returns the expectation value of the\n measured Pauli string.\n\n Args:\n circuit: Circuit to execute.\n shots: Number of circuit executions.\n \"\"\"", "target_code": " backend = qis.BasicAer.get_backend(\"qasm_simulator\")\n job = qis.execute(circuit, backend, shots=shots)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"QCBQ\n\n# # Implementing Variational Quantum Algorithms\n\n# **Author:** Ryan LaRose\n#\n# (lightly modified by Justin for use in the QuIC seminar)\n\n# In this notebook, we'll introduce a class of circuits/algorithms known as **variational quantum algorithms**. Variational algorithms are a class of algorithms that may foreseeably be implemented on near term intermediate scale quantum computers, as opposed to things like Shor's factoring algorithm which require large scale, fault tolerant quantum computers. The idea is to build circuits with parameters we may vary, estimate the expectation values of those circuits, compute some cost function, and implement classical optimization algorithms so we can figure out how to vary our circuit to get closer to the true answer. 
Qiskit has a package (Qiskit Aqua) that does this all behind the scenes, but it's important to understand what's actually going on in the code, so we'll implement all these steps ourselves!\n\n# ## Learning goals\n\n# (1) Be able to implement a circuit with variable parameters and update these parameters.\n#\n# (2) Be able to estimate expectation values using a quantum circuit.\n#\n# (3) Understand how to compute a cost function with a quantum circuit.\n#\n# (4) Use a classical optimization algorithm to find the best parameters.\n\n# ## Helpful background\n\n# * [Born's rule](https://en.wikipedia.org/wiki/Born_rule) for probabilities in quantum mechanics.\n\n\n\"\"\"Imports for the notebook.\"\"\"\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom scipy.optimize import minimize\n\nimport qiskit as qis\n\n\n# ## Circuits with parameters\n\n# A key characteristic of any variational quantum algorithm is a parameterized circuit. More generally, variational quantum algorithms are instances of the well-known variational principle of quantum mechanics. This principle states that you can never get below the ground state energy of a particular Hamiltonian. The proof of this theorem is \"duh, ground state means lowest energy.\" The formal proof is just a mathematical translation of this fact.\n\n# **Theorem (The Variational Principle):** Let $H = H^\\dagger$ be a Hamiltonian with spectrum (energy) $E_0 \\le E_1 \\le \\cdots \\le E_n$. Then, for any valid wavefunction $|\\psi\\rangle$,\n#\n# $$ E_0 \\le \\langle \\psi | H | \\psi \\rangle .$$\n\n# *Proof*: Write $|\\psi\\rangle$ in the eigenbasis of $H$\n#\n# $$ |\\psi\\rangle = \\sum_n c_n |\\psi_n\\rangle $$\n#\n# where $c_n := \\langle \\psi | \\psi_n \\rangle$ and $H |\\psi_n \\rangle = E_n |\\psi_n\\rangle$. Note that $\\sum_n |c_n|^2 = 1$ by virtue of proper normalization. Now,\n#\n# \\begin{align}\n# \\langle \\psi | H | \\psi \\rangle &= \\left[ \\sum_m c_m | \\psi_m \\rangle \\right]^\\dagger H \\left[ \\sum_n c_n |\\psi_n\\rangle \\right] \\\\\n# &= \\sum_{m, n} E_n c_m^* c_n \\langle \\psi_m | \\psi_n \\rangle \\\\\n# &= \\sum_n E_n |c_n|^2 \\\\\n# &\\ge E_0 \\sum_n |c_n|^2 \\\\\n# &= E_0 .\n# \\end{align}\n#\n# _______________________________________________________________________________\n\n# The **big idea** in all variational methods is to prepare some \"**ansatz\"** wavefunction\n#\n# $$ |\\psi\\rangle = |\\psi(\\mathbf{\\alpha}) \\rangle $$\n#\n# parameterized by $\\mathbf{\\alpha} = (\\alpha_1, \\alpha_2, ..., \\alpha_n)$. When solving quantum mechanics problems, we then vary $\\mathbf{\\alpha}$ to try and minimize $\\langle\\psi(\\alpha)|\\mathbf{H}|\\psi(\\alpha)\\rangle$ and find a wavefunction close to the ground state wavefunction. In the context of quantum computing, we prepare states by implementing gates, so our gates should have some tunable parameters in them. This is what we mean by \"circuits with parameters.\"\n\n# ### Defining circuits with parameters\n\n# The code cell below shows an example of defining a two qubit circuit with two parameters. Run this cell to visualize the circuit.\n\n\n\"\"\"Defining circuits with parameters.\"\"\"\n# Get a circuit and registers\nqreg = qis.QuantumRegister(2)\ncreg = qis.ClassicalRegister(2)\ncirc = qis.QuantumCircuit(qreg, creg)\n\n# Add gates with particular parameters\ncirc.h(qreg)\ncirc.rx(0.2, qreg[0])\ncirc.cx(qreg[0], qreg[1])\ncirc.ry(0.1, qreg[1])\n\n# Visualize the circuit\nprint(circ)\n\n\n# The angles come in through single qubit rotation gates. 
For this example, we instantiated these angles with example (arbitrary) values. Generally, it is useful to have some means of quickly generating the same circuit but with different variational parameters. In terms of programming, this can be done in a variety of ways, including symbolic representation of parameters or functions. We'll use the latter case for this notebook.\n\n# **Do this:** In the cell below, write a function in python which inputs two paramters (floats) and returns the above circuit with rotation angles equal to these parameters. If you've never written a python function before, ask for help! It's super easy (no curly brackets required). Once you've done that, verify you can call your function by running the cell under that.\n\n\ndef circuit(alpha1: float, alpha2: float):\n \"\"\"Returns the circuit above with the input parameters.\"\"\"\n # Your code here!\n\n return circ\n\n\nmy_circ = circuit(0.2, 0.3)\nprint(my_circ)\n\n\n# We can now easily vary the parameters in our circuit! You could of course do this by creating a new cell and calling the function with several different parameters. (You should do this to at least make sure your function works!)\n#\n# The reason we vary parameters is to minimize some energy (which comes from a cost function/Hamiltonian). This involves two things:\n#\n# 1. Computing the energy.\n# 1. Varying the parameters.\n#\n# We'll break down computing energy into two sub-steps: Individual expectation values, and weighted sums of expectation values.\n\n# ## Computing expectation values\n\n# So far, we know that variational algorithms tackle the problem\n#\n# $$ \\min_{\\alpha} \\langle \\psi(\\alpha) | H | \\psi(\\alpha) \\rangle $$\n#\n# **How do we compute expectation values like $\\langle \\psi | H | \\psi \\rangle$ on a quantum computer?**\n\n# ### A diagonal operator\n\n# Suppose first we have a one qubit wavefunction $|\\psi\\rangle$ and we want to compute\n#\n# $$\\langle \\psi | Z | \\psi \\rangle.$$\n#\n# where $Z$ is the usual Pauli suspect\n#\n# $$ Z = \\left[ \\begin{matrix}\n# 1 & 0 \\\\\n# 0 & -1 \\\\\n# \\end{matrix} \\right] $$\n#\n# in the computational basis.\n#\n# Starting from first principles, Born's rule tells us that the probability of measuring $0$ is\n#\n# $$ p(0) = | \\langle \\psi | 0 \\rangle |^2 =\n# \\langle \\psi | 0 \\rangle \\langle 0 | \\psi \\rangle =\n# \\langle \\psi | \\Pi_0 | \\psi \\rangle $$\n#\n# where $\\Pi_0 := |0\\rangle \\langle 0 |$ is the projector onto the $|0\\rangle$ state.\n\n# **Question:** Write out the $2 \\times 2$ matrix representation of $\\Pi_0$ in the computational basis.\n\n# **Answer:** Answer the above question here!\n\n# Similarly, the probability of measuring $1$ is\n#\n# $$ p(1) = | \\langle \\psi | 1 \\rangle |^2 =\n# \\langle \\psi | 1 \\rangle \\langle 1 | \\psi \\rangle =\n# \\langle \\psi | \\Pi_1 | \\psi \\rangle $$\n#\n# where $\\Pi_1 := |1 \\rangle \\langle 1 |$ is the projector onto the $|1\\rangle$ state.\n\n# **Question:** Write out the $2 \\times 2$ matrix representation of $\\Pi_1$ in the computational basis.\n\n# **Answer:** Answer the above question here!\n\n# **Question:** Prove that $\\Pi_0 + \\Pi_1 = I$ (the identity). Use this to check that $p(0) + p(1) = 1$, as it must.\n\n# **Answer:** Answer the above question here!\n\n# The previous question showed that if we add together the projectors $\\Pi_0$ and $\\Pi_1$, nothing interesting happens. 
The follow question shows that if we **subtract** the projectors, something interesting happens!\n\n# **Question:** Prove that $\\Pi_0 - \\Pi_1 = Z$.\n\n# **Answer:** Answer the above question here!\n\n# As you showed above, we have\n#\n# $$ Z = \\Pi_0 - \\Pi_1. $$\n#\n# We can use this to estimate $\\langle \\psi | Z | \\psi \\rangle$ by measuring in the standard basis and doing a bit of \"classical postprocessing,\" which here just means subtracting the outcome probabilities.\n#\n# $$ \\langle \\psi | Z | \\psi \\rangle = \\langle \\psi | (\\Pi_0 - \\Pi_1) | \\psi \\rangle =\n# \\langle \\psi | \\Pi_0 | \\psi \\rangle - \\langle \\psi | \\Pi_1 | \\psi \\rangle =\n# p(0) - p(1) . $$\n#\n# Of course, we won't have the full probability distribution $p$, but instead we'll have to sample from the circuit many times ($N$ times) to get a good estimate $p(0) \\approx f(0) / N$, where $f(0)$ is the frequency of measuring $0$. Similarly for $p(1)$.\n\n# **Do this:** In the following cell, a one qubit state $|\\psi\\rangle = H|0\\rangle$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | Z | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. Compute the expectation analytically, and show the results agree.\n\n\n\"\"\"Estimating a one qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(1)\ncreg = qis.ClassicalRegister(1)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.h(qreg)\n\n# Your code here!\n\n\n# ### A non-diagonal operator\n\n# Suppose now, for example, we want to measure the expecation value of $X$, which is not diagonal in the computational basis:\n#\n# $$\\langle \\psi | X | \\psi \\rangle.$$\n#\n# Here, the key \"trick\" is to rotate to the eigenbasis, where $X$ becomes diagonal. You may recall or wish to prove that\n#\n# $$ HXH = Z . $$\n#\n# Let $|\\psi'\\rangle = H |\\psi \\rangle$, and suppose what happens when we measure $|\\psi'\\rangle$ in the computational basis. By the same argument above, we have\n#\n# $$ p(0) = \\langle \\psi ' | \\Pi_0 | \\psi' \\rangle =\n# \\langle \\psi | H \\Pi_0 H | \\psi \\rangle,$$\n#\n# where in the last step we substituted $|\\psi'\\rangle = H |\\psi\\rangle$. Similarly,\n#\n# $$ p(1) = \\langle \\psi ' | \\Pi_1 | \\psi' \\rangle =\n# \\langle \\psi | H \\Pi_1 H | \\psi \\rangle . $$\n#\n# We can now subtract the probabilities, again using the fact that $Z = \\Pi_0 - \\Pi_1$, as above:\n#\n# $$ p(0) - p(1) = \\langle \\psi | H \\Pi_0 H | \\psi \\rangle - \\langle \\psi | H \\Pi_1 H | \\psi \\rangle =\n# \\langle \\psi | H (\\Pi_0 - \\Pi_1 ) H | \\psi \\rangle =\n# \\langle \\psi | H Z H | \\psi \\rangle =\n# \\langle \\psi | X | \\psi \\rangle, $$\n#\n# which is exactly the quantity we want to compute.\n#\n# This is why we implement the \"appropriate rotation\" when we want to measure the expectation value of an operator. The key insight is that measurement probabilities can be written as expectation values of projectors.\n\n# **Do this:** In the following cell, a one qubit state $|\\psi\\rangle = |-\\rangle = 1/\\sqrt{2}(|0\\rangle - |1\\rangle)$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | X | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. Compute the expectation analytically, and show the results agree.\n#\n# Note that the output of counts is a [dictionary](https://docs.python.org/3/library/stdtypes.html#typesmapping) containing the result paired to the number of counts. 
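# By way of illustration, here is a self-contained sketch of the kind of estimate this exercise
# asks for (assuming the BasicAer qasm_simulator backend; the `_demo` names are illustrative only).
# Missing dictionary keys are handled with dict.get, which is one alternative to an explicit if.
# The exact value is <-|X|-> = -1, so the sampled estimate should land close to -1.

qreg_demo = qis.QuantumRegister(1)
creg_demo = qis.ClassicalRegister(1)
demo = qis.QuantumCircuit(qreg_demo, creg_demo)
demo.x(qreg_demo)
demo.h(qreg_demo)                  # prepare |->
demo.h(qreg_demo)                  # rotate to the X eigenbasis before measuring
demo.measure(qreg_demo, creg_demo)

shots = 10000
backend = qis.BasicAer.get_backend("qasm_simulator")
counts = qis.execute(demo, backend, shots=shots).result().get_counts()
print((counts.get('0', 0) - counts.get('1', 0)) / shots)   # estimate of <X>, close to -1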
If your circuit produces no counts of a certain result, it won't output a dictionary element for that result, so you may have to include an ```if``` statement to account for that.\n\n\n\"\"\"Estimating a one qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(1)\ncreg = qis.ClassicalRegister(1)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.x(qreg)\ncirc.h(qreg)\n\n# Your code here!\n\n\n# ### A two-qubit operator\n\n# We now know how to estimate $\\langle Z \\rangle$ and $\\langle X \\rangle$. **What if we want to estimate $\\langle Z \\otimes X \\rangle$?**\n#\n# The trick is to do the same thing! We still rotate to the eigenbasis of each operator and measure both qubits. We now have four possible measurement outcomes and so four probabilities: $p(00)$, $p(01)$, $p(10)$, and $p(11)$. What is the appropriate \"classical post-processing\" to do with these sampled outcomes?\n#\n# We can measuring the expectations separately and expand the product:\n#\n# $$ (p_0(0) - p_0(1))(p_1(0) - p_1(1)) = p(00) - p(01) - p(10) + p(11). $$\n#\n# Note that the subscript on the LHS refers to the qubit (either $0$ or $1$), and the value in parentheses refers to the measurement outcome. On the RHS, $p(00)$ is equivalent to $p_0(0) p_1(0)$, etc.\n\n# **Do this:** In the following cell, a two qubit state $|\\psi\\rangle = |+\\rangle = 1/\\sqrt{2}(|0\\rangle + |1\\rangle)$ is prepared for you in a circuit.\n#\n# 1. Estimate $\\langle \\psi | Z \\otimes X | \\psi \\rangle$ by executing the circuit many times and doing the appropriate \"classical post-processing.\"\n# 1. Compute the expectation analytically, and show the results agree.\n\n\n\"\"\"Estimating a two qubit expectation value.\"\"\"\nqreg = qis.QuantumRegister(2)\ncreg = qis.ClassicalRegister(2)\ncirc = qis.QuantumCircuit(qreg, creg)\ncirc.h(qreg[1])\n\n# Your code here!\n\n\n# ### A general operator\n\n# Thankfully for our discussion, any $n$-qubit operator $O$ can be decomposed in the Pauli basis:\n#\n# $$ O = \\sum_i o_i \\sigma_{i_1} \\otimes \\sigma_{i_2} \\otimes \\cdots \\otimes \\sigma_{i_n} $$\n#\n# Here, $o_i \\in \\mathbb{C}$ is a scalar and each $\\sigma$ is a Pauli. By linearity, to evaluate the expectation, we can evaluate the expectation of each Pauli string:\n#\n# $$ \\langle O \\rangle = \\sum_i o_i \\langle \\sigma_{i_1} \\otimes \\sigma_{i_2} \\otimes \\cdots \\otimes \\sigma_{i_n} \\rangle $$\n#\n# Thus, **to evaluate the expectation of any operator, it suffices to evaluate expectations of arbitrary Pauli strings**. Below, you are asked to write a function which generalizes the two qubit expectation value measurement $\\langle Z \\otimes X \\rangle$ to any number of qubits. Before tackling this, you may wish to prove the following fact to yourself, based on the expansion of the two-qubit expectation above.\n\n# **Question**: We can think of the classical post-processing for computing expectation values as follows. From our circuit of $n$ qubits, we measure the frequency of $2^n$ possible bitstrings. For example, for $n = 3$, the possible bitstrings we can measure are 000, 001, 010, 011, 100, 101, 110, and 111. The classical post-processing consists of summing up these frequencies **with the appropriate sign**. 
Prove that the sign for bitstring $z_i$ is $(-1)^{N_0(z_i)}$ where $N_0(z_i)$ is the number of zeros in bit string $z_i$.\n\n# **Do this:** Answer the above question here!\n\n# The block of code below contains a function ```expectation_circuit``` for finding appropriately modifying a circuit to measure the expectation value of a general string of operators. The function takes in an $n$ qubit circuit and a string of length $n$ containing pauli operators, and outputs the appropriate circuit to run to estimate the expectation value of the operator defined by the string. For example, if we had a 3 qubit circuit and wanted to measure $I\\otimes X \\otimes Z$, we would pass the circuit and ``` \"IXZ\" ``` to the function.\n#\n# Read through this function, and make sure you understand what's going on!\n\n\n\"\"\"Helper function to evaluate the expectation of any valid Pauli string.\"\"\"\n\n\ndef expectation_circuit(circuit: qis.QuantumCircuit, pauli_string: str) -> qis.QuantumCircuit:\n \"\"\"Returns a circuit to compute expectation of the Pauli string in the \n state prepared by the input circuit.\n\n Args:\n circuit: Prepares the state |\\psi> from |0>.\n pauli_string: String (tensor product) of Paulis to evaluate\n an expectation of. The length of pauli_string\n must be equal to the total number of qubits in\n the circuit. (Use identities for no operator!)\n \"\"\"\n temp = circuit.copy()\n\n if len(circuit.qregs) != 1:\n raise ValueError(\"Circuit should have only one quantum register.\")\n if len(circuit.cregs) != 1:\n print(\"# cregs =\", len(circuit.cregs))\n raise ValueError(\"Circuit should have only one classical register.\")\n\n qreg = circuit.qregs[0]\n creg = circuit.cregs[0]\n nqubits = len(qreg)\n pauli_string = pauli_string.upper().strip()\n\n if len(pauli_string) != nqubits:\n raise ValueError(\n f\"Circuit has {nqubits} qubits but pauli_string has {len(pauli_string)} operators.\"\n )\n\n for (qubit, pauli) in enumerate(pauli_string):\n if pauli == \"I\":\n continue\n elif pauli == \"X\":\n temp.h(qreg[qubit])\n temp.measure(qreg[qubit], creg[qubit])\n elif pauli == \"Y\":\n temp.s(qreg[qubit])\n temp.h(qreg[qubit])\n temp.measure(qreg[qubit], creg[qubit])\n elif pauli == \"Z\":\n temp.measure(qreg[qubit], creg[qubit])\n else:\n raise ValueError(\n f\"{pauli} is an invalid Pauli string key. Should be I, X, Y, or Z.\")\n\n return temp\n\n\n# **Do this:** As a sanity check, let's test this function below on measuring $X \\otimes Y$. Use the function to modify the input circuit, then print out the modified circuit. Test this on other Pauli strings as well.\n\n\n\"\"\"Test your function here.\"\"\"\ncirc = circuit(np.pi / 2, np.pi / 4)\nprint(\"Bare circuit:\")\nprint(circ)\n\n# Your code here!\n\n\n# The following cell contains a function that takes in an \"expectation circuit\" (a circuit with the appropriate transformations applied at the end) and performs the classical post-processing to return an expectation value. 
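# As a rough sketch of what such post-processing can look like (assuming the BasicAer
# qasm_simulator backend; `run_and_process_sketch` is an illustrative stand-in, and the actual cell
# below may differ in details): execute the circuit, then sum the measured frequencies with a sign
# given by the parity of the 1s in each bitstring, which is equivalent, up to an overall sign fixed
# by the register length, to the (-1)^{N_0(z)} rule in the question above.

def run_and_process_sketch(circuit: qis.QuantumCircuit, shots: int = 10000) -> float:
    """Illustrative only: execute an 'expectation circuit' and post-process the counts."""
    backend = qis.BasicAer.get_backend("qasm_simulator")
    counts = qis.execute(circuit, backend, shots=shots).result().get_counts()
    total = 0.0
    for bitstring, frequency in counts.items():
        sign = (-1) ** bitstring.count("1")   # parity of the measured 1s
        total += sign * frequency
    return total / shots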
Once again, read through this function and make sure you understand what's going on!\n\n\n\"\"\"Function to execute the circuit and do the postprocessing.\"\"\"\n\n\ndef run_and_process(circuit: qis.QuantumCircuit, shots: int = 10000) -> float:\n \"\"\"Runs an 'expectation circuit' and returns the expectation value of the\n measured Pauli string.\n\n Args:\n circuit: Circuit to execute.\n shots: Number of circuit executions.\n \"\"\"\n", "project_metadata": {"full_name": "rmlarose/QuIC-Seminar", "description": "Code repository for the QuIC Seminar at Michigan State University.", "topics": [], "git_url": "git://github.com/rmlarose/QuIC-Seminar.git", "stars": 7, "watchers": 7, "forks": 5, "created": "2019-03-27T16:11:09Z", "size": 5820, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1791428}, "last_updated": "2020-12-29T04:12:14Z"}, "intent": " # Execute the circuit"}, {"original_comment": "# train / fit the Support Vector Machine classifier\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. 
In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. 
\"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. 
Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. 
the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n#%%", "target_code": "svm.fit(x_train, y_train)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. 
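# Concretely, once a separating hyperplane $(w, b)$ has been learned, a new observation $x$ is
# classified by which side of the hyperplane it falls on:
#
# $$ \hat{y}(x) = \mathrm{sign}(w \cdot x + b) . $$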
Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. 
This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. 
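# Written as a standard optimization problem, the max-margin hyperplane of the separable case is
# therefore the solution of the convex quadratic program
#
# $$ \min_{w, b} \; \frac{1}{2}||w||^{2} \quad \text{s.t.} \quad y_{i}(x_{i} \cdot w + b) \geq 1, \quad i = 1, ..., l . $$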
Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. 
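# For concreteness, the dual problem that such an off-the-shelf solver works with can be written
# out in full, including the data inner products and the constraints on the multipliers:
#
# $$ \max_{\alpha} \; \sum_{i=1}^{l}\alpha_{i} - \frac{1}{2} \sum_{i,j=1}^{l} \alpha_{i}\alpha_{j} y_{i} y_{j} \, x_{i} \cdot x_{j} \quad \text{s.t.} \quad \alpha_{i} \geq 0, \quad \sum_{i=1}^{l}\alpha_{i}y_{i} = 0 . $$
#
# The training data enter only through the inner products $x_{i} \cdot x_{j}$, which is what makes
# non-linear kernel functions possible. As a brief preview of the end-to-end usage this section
# builds toward (a sketch only, reusing the train/evaluation split created above):

# sketch: fit on the training split, then score on the held-out evaluation split
svm_sketch = SVC(kernel='linear', random_state=random_seed)
svm_sketch.fit(x_train, y_train)                 # solves the dual problem internally
y_pred_sketch = svm_sketch.predict(x_eval)
print('evaluation accuracy:', metrics.accuracy_score(y_eval, y_pred_sketch))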
Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "intent": "# train / fit the Support Vector Machine classifier"}, {"original_comment": "# loading the saved clf model pickle\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom flask import render_template, request, flash, redirect, Flask\nfrom forms import HouseForms, MortgageInputForm\nimport pickle\nimport dill\nimport pandas as pd\nimport os.path\n\n#%%", "target_code": "import pickle\n\nclf_model = open(\"models/clf_model_new.pkl\", \"rb\")\nmodel = pickle.load(clf_model)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom flask import render_template, request, flash, redirect, Flask\nfrom forms import HouseForms, MortgageInputForm\nimport dill\nimport pandas as pd\nimport os.path\n\n", "project_metadata": {"full_name": "vtyeh/machine-learning-house-predictions", "description": null, "topics": [], "git_url": "git://github.com/vtyeh/machine-learning-house-predictions.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-07-03T21:19:04Z", "size": 17433, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4951470, "HTML": 42959, "Python": 21441, "CSS": 13173, "JavaScript": 1981}, "last_updated": "2018-08-22T19:11:26Z"}, "intent": "# loading the saved clf model pickle"}, {"original_comment": " # Close with the q button\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport cv2\n\n# Mouse Callback Function\n\n\ndef draw_circle(event, x, y, flags, param):\n\n global center, clicked\n\n # Get Mouse Click on Down and Track the Center\n if event == cv2.EVENT_LBUTTONDOWN:\n center = (x, y)\n clicked = False\n\n if event == cv2.EVENT_LBUTTONUP:\n clicked = True\n\n\n# Zero Drawing yet\ncenter = (0, 0)\nclicked = False\n\n# Capture the Video\ncap = cv2.VideoCapture(0)\n\n# Create a Named Window for the Cinnections\ncv2.namedWindow('Testing')\n\n# Bind the draw_circle function to Mouse Clicks\ncv2.setMouseCallback('Testing', draw_circle)\n\nwhile True:\n\n # Capture frame by frame\n ret, frame = cap.read()\n\n # Use if to check if Clicked is True\n if clicked:\n # Draw a Circle on the Frame\n cv2.circle(frame, center=center, radius=50,\n color=(255, 0, 0), thickness=3)\n\n # Display the resulting Frame\n cv2.imshow('Testing', frame)", "target_code": " if cv2.waitKey(1) & 0xFF == ord('q'):\n break\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport cv2\n\n# Mouse Callback Function\n\n\ndef draw_circle(event, x, y, flags, param):\n\n global center, clicked\n\n # Get Mouse Click on Down and Track the Center\n if event == 
cv2.EVENT_LBUTTONDOWN:\n center = (x, y)\n clicked = False\n\n if event == cv2.EVENT_LBUTTONUP:\n clicked = True\n\n\n# Zero Drawing yet\ncenter = (0, 0)\nclicked = False\n\n# Capture the Video\ncap = cv2.VideoCapture(0)\n\n# Create a Named Window for the Cinnections\ncv2.namedWindow('Testing')\n\n# Bind the draw_circle function to Mouse Clicks\ncv2.setMouseCallback('Testing', draw_circle)\n\nwhile True:\n\n # Capture frame by frame\n ret, frame = cap.read()\n\n # Use if to check if Clicked is True\n if clicked:\n # Draw a Circle on the Frame\n cv2.circle(frame, center=center, radius=50,\n color=(255, 0, 0), thickness=3)\n\n # Display the resulting Frame\n cv2.imshow('Testing', frame)\n", "project_metadata": {"full_name": "EliasPapachristos/Computer_Vision_with_OpenCV", "description": "Computer Vision projects with OpenCV and Python", "topics": [], "git_url": "git://github.com/EliasPapachristos/Computer_Vision_with_OpenCV.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-04-16T19:06:14Z", "size": 31708, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1773505}, "last_updated": "2020-12-11T21:48:08Z"}, "intent": " # Close with the q button"}, {"original_comment": "# Load model from disk.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom tmle.visualizations import plot_confusion_matrix\nfrom sklearn.metrics import confusion_matrix\nimport pickle\nfrom sklearn.metrics import balanced_accuracy_score\nfrom tmle.model_selection import ClassifierOptimizer\nfrom hyperopt import tpe, fmin, hp, Trials, STATUS_OK\nimport hyperopt\nfrom sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score\nfrom joblib import load\nfrom sklearn.svm import LinearSVC\nfrom sklearn.preprocessing import StandardScaler\nfrom tmle.transformers import HOGTransformer\nfrom sklearn.pipeline import Pipeline\nfrom torchvision import transforms\nfrom tmle.dataloaders import ImageFoldersDataset\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# Computations performed in this notebook may take hours. Therefore, we have implemented notebook extension called `skip_cell` which could be used to omit certain cells. We defined global variables `IMPATIENCE` which should be set to `True` when you want to load data and/or models from disk.\n\n#%%\n\nget_ipython().run_line_magic('load_ext', 'skip_cell')\n\n#%%\n\nIMPATIENCE = True\n\n\n# If `SAMPLE` is `None` the entire dataset will be used during training.\n# `N_CHANNELS` controls the number of channels in images (for `grayscale` use `1`, for `RGB` use `3`).\n# `EXPERIMENT_NAME` defines the name of a file that stores information about results of Bayesian optimization of hyperparameters.\n\n#%%\n\nSAMPLE = None\nN_CHANNELS = 1\nEXPERIMENT_NAME = 'shallow_clf_tpe_pipeline_cpu_overfit_penalty'\n\n\n# # Data preparation\n#\n# We start from loading data into memory.\n\n#%%\n\n# Images (both in training and test sets) have different, unregular sizes. Thus, we will use transformations that will first scale the images and then crop (randomly) a square-shaped fragments. After that, images will be converted to `torch.Tensors`.\n#\n# Note that:\n#\n# * `transforms.Resize` will resize the input PIL Image to given size. If size is an int, smaller edge of the image will be mathced to this number, ie. if `height > width`, then image will be rescaled to `(size * height / width, size)`,\n# * `transforms.RandomCrop` will crop the given PIL Image at a random location. 
If size is an int instead of sequence like `(height, width)`, a square crop `(size, size)` is made.\n\n#%%\n\nsimple_transform = transforms.Compose([\n transforms.Grayscale(),\n transforms.Resize(32),\n transforms.RandomCrop(32),\n transforms.ToTensor()\n])\n\n\n# Then, we will create an object that will allow us to load images in *mini-batches*. For each channel (or a single channel in case of grayscale images) we will calculate mean and standard deviation. This values will be used to normalize the input data.\n\n#%%\n\ndataset = ImageFoldersDataset(\n path_to_data='../data/cpu/train',\n transform=simple_transform\n)\n\n\n# Notice that even for small images, ie. `(224, 224, 3)` calculating means of each channel requires performing operations on vectors of sizes: `n_samples * 224 * 224 * 3` (on an ordinary laptop problems with lack of memory may occur). Thus, for benchmark purposes we used smaller images with output shape of `(32, 32, 1)`.\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n \"from collections import defaultdict\\n\\nmeans, stds = defaultdict(list), defaultdict(list)\\ncounter = 0\\nfor data in dataset.loader(batch_size=170):\\n images, _ = data\\n for channel in list(range(N_CHANNELS)):\\n means[channel].append(images[:, channel, :, :].mean().item())\\n stds[channel].append(images[:, channel, :, :].std().item())\\n counter += 1\\n if counter % 10 == 0:\\n print('Mean calculated for {n} batches'.format(n=counter / (17000 / 170)))\\n# save means and stds\\nmeans = [np.mean(means[channel]) for channel in list(range(N_CHANNELS))]\\nstds = [np.mean(stds[channel]) for channel in list(range(N_CHANNELS))]\")\n\n\n# Then, we add a transformation that normalizes the input images and load into memory the number of images defined in a variable `SAMPLE` (or the entire dataset if `SAMPLE` is `None`.\n\n#%%\n\nsimple_transform = transforms.Compose([\n transforms.Grayscale(),\n transforms.Resize(32),\n transforms.RandomCrop(32),\n transforms.ToTensor(),\n transforms.Normalize(mean=means, std=stds)\n])\n\n#%%\n\ndataset_normalized = ImageFoldersDataset(\n path_to_data='../data/cpu/train/',\n transform=simple_transform\n)\nif SAMPLE:\n X_train, y_train = dataset_normalized.load_all_images(\n img_shape=(32, 32, 1))\n random_sample_idx = np.random.randint(\n low=0, high=len(X_train), size=SAMPLE)\n X_train, y_train = X_train[random_sample_idx], y_train[random_sample_idx]\nelse:\n X_train, y_train = dataset_normalized.load_all_images(\n img_shape=(32, 32, 1))\n\n\n# ***Important note about reproducibility***.\n#\n# Completely reproducible results are not guaranteed across PyTorch releases, individual commits or different platforms. Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds.\n#\n# However, in order to make computations deterministic on specific problem on one specific platform and Pytorch release, there are a couple of steps to take.\n#\n# There are two pseudorandom number generators involved in PyTorch, which we had to seed manually to made runs reproducible. We implemented `tmle.dataloaders.ImageFolderDataset` setting seed as follows:\n#\n# torch.manual_seed(seed)\n# torch.cuda.manual_seed(seed)\n# np.random.seed(seed)\n# random.seed(seed)\n\n# # Shallow Classifier\n\n#%%\n\n# Classes are imbalanced.\n\n#%%\n\nnp.bincount(y_train.astype('int64')) / y_train.shape[0]\n\n\n# We start with the definition of a simple `Pipeline`. It will be used for presentation purposes. 
With ***default*** values of *hyperparameters* we encounter a problem of overfitting.\n\n#%%\n\npipeline = Pipeline(steps=[\n ('hog', HOGTransformer(\n img_shape=(32, 32),\n orientations=9,\n pixels_per_cell=(8, 8),\n cells_per_block=(2, 2))\n ),\n ('scaler', StandardScaler()),\n ('svm', LinearSVC())\n])\n\n\n# Fit `pipeline` to training set.\n\n#%%\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE',\n \"pipeline.fit(X_train, y_train)\\n# save model\\nfrom joblib import dump\\ndump(pipeline, '../models/shallow_clf_starter.joblib')\")\n\n\n# Load model from disk.\n\n#%%\n\npipeline = load('../models/shallow_clf_starter.joblib')\n\n\n# Make predictions on training set.\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'y_train_preds = pipeline.predict(X_train)')\n\n\n# Measure performance with `balanced_accuracy_score` and prepare `classification_report`.\n\n#%%\n\nprint('Accuracy: {acc:.5f}. Balanced accuracy: {bal_acc:.5f}'.format(\n acc=accuracy_score(y_train, y_train_preds),\n bal_acc=balanced_accuracy_score(y_train, y_train_preds)\n))\n\n#%%\n\nprint(classification_report(y_train, y_train_preds))\n\n\n# Load test set.\n\n#%%\n\ntest_dataset = ImageFoldersDataset(\n path_to_data='../data/cpu/test/',\n transform=simple_transform\n)\nX_test, y_test = test_dataset.load_all_images(img_shape=(32, 32, 1))\ny_test_preds = pipeline.predict(X_test)\n\n\n# Measure performance on test set.\n\n#%%\n\nprint('Accuracy: {acc:.5f}. Balanced accuracy: {bal_acc:.5f}'.format(\n acc=accuracy_score(y_test, y_test_preds),\n bal_acc=balanced_accuracy_score(y_test, y_test_preds)\n))\n\n\n# For sure, performance of the model can be better.\n\n# # Hyperparameters tuning\n\n# We start from definition of `Pipeline` which will be fed with hyperparameters values sampled from space defined in next subsection. Our `Pipeline` takes three steps:\n#\n# * `HOGTransformer` which will convert images into feature vector based on histograms of oriented gradients,\n# * `StandardScaler` which scales output of `HOGTransformer` to avoid attributes in greater numeric ranges dominating those in smaller numeric ranges (however, the `HOG` should be on a similar scale),\n# * `LinearSVC` which will classify the images. It scales good in terms of both: number of instances and number of features.\n\n#%%\n\npipe = Pipeline(steps=[\n ('hog', HOGTransformer(img_shape=(32, 32))),\n ('scaler', StandardScaler()),\n ('svm', LinearSVC(max_iter=20000))\n])\n\n\n# ## Define hyperparameters space\n#\n# Our definition of *hyperparameters space* encourages `TPE` algorithm to suggest `Pipelines` which differs not only in terms of classifier, but also in terms of operations applied to data in *preprocessing* stage.\n\n#%%\n\nspace = dict()\nspace['hog__orientations'] = hp.choice('orientations', [9, 12, 18])\nspace['hog__pixels_per_cell'] = hp.choice('pixels_per_cell', [(4, 4), (8, 8)])\nspace['hog__cells_per_block'] = hp.choice(\n 'cells_per_block', [(1, 1), (2, 2), (4, 4)])\nspace['hog__block_norm'] = hp.choice('block_norm', ['L1', 'L2-Hys'])\nspace['svm__loss'] = hp.choice('loss', ['hinge', 'squared_hinge'])\nspace['svm__class_weight'] = hp.choice('class_weight', [None, 'balanced'])\nspace['svm__C'] = hp.uniform('C', 0.0001, 1)\n\n\n# ## Conduct experiments\n#\n# We will conduct experiments in conditions of imbalanced dataset. 
Therefore, we will use:\n#\n# * `sklearn.model_selection.StratifiedKFold` in order to preserve the comparable share of instances from given classes in both: training and validation sets,\n# * `sklearn.metrics.balanced_accuracy_score` to measure a performance of given classifier on both: training and validation sets. It is defined as the average of recall obrained on each class. The best value is 1 and the worst value is 0. Our loss function was defined as: `1 - mean_balanced_accuracy_score(X_validation)` (when `overfit_penalty` is not `None` then constant is added to losses when `mean_balanced_accuracy_score(X_train) - mean_balanced_accuracy_score(X_validation) > overfit_penalty`.\n\n#%%\n\nclf_optim = ClassifierOptimizer(\n classifier=pipe,\n space=space,\n metric=balanced_accuracy_score\n)\n\n#%%\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE', 'import warnings\\nwarnings.filterwarnings(\"ignore\")\\n\\nclf_optim.find_best_params(\\n X_train,\\n y_train,\\n experiments_path=\\'../experiments/\\',\\n experiments_name=EXPERIMENT_NAME,\\n max_evals=500,\\n overfit_penalty=0.1\\n)')\n\n\n# # Measure performance on test set\n\n# We will load information about the process of parameters optimization. We will use the best set of parameters to train classifier on whole training set.\n\n#%%\n\nTRIALS_PATH = os.path.join(\n '../experiments/', '.'.join([EXPERIMENT_NAME, 'hpopt']))\n\nwith open(TRIALS_PATH, 'rb') as trials:\n trials = pickle.load(trials)\n\n\n# The following dictionary shows the best set of parameters.\n\n#%%\n\nclf_optim.space_eval(trials.best_trial)\n\n\n# We will update `pipe` with *hyperparameters* values.\n\n#%%\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE',\n \"pipe.set_params(**clf_optim.space_eval(trials.best_trial))\\npipe.fit(X_train, y_train)\\n# save model\\nfrom joblib import dump\\ndump(pipe, '../models/shallow_clf.joblib')\")", "target_code": "pipe = load('../models/shallow_clf.joblib')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom tmle.visualizations import plot_confusion_matrix\nfrom sklearn.metrics import confusion_matrix\nimport pickle\nfrom sklearn.metrics import balanced_accuracy_score\nfrom tmle.model_selection import ClassifierOptimizer\nfrom hyperopt import tpe, fmin, hp, Trials, STATUS_OK\nimport hyperopt\nfrom sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score\nfrom joblib import load\nfrom sklearn.svm import LinearSVC\nfrom sklearn.preprocessing import StandardScaler\nfrom tmle.transformers import HOGTransformer\nfrom sklearn.pipeline import Pipeline\nfrom torchvision import transforms\nfrom tmle.dataloaders import ImageFoldersDataset\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# Computations performed in this notebook may take hours. Therefore, we have implemented notebook extension called `skip_cell` which could be used to omit certain cells. 
We defined global variables `IMPATIENCE` which should be set to `True` when you want to load data and/or models from disk.\n\n\nget_ipython().run_line_magic('load_ext', 'skip_cell')\n\n\nIMPATIENCE = True\n\n\n# If `SAMPLE` is `None` the entire dataset will be used during training.\n# `N_CHANNELS` controls the number of channels in images (for `grayscale` use `1`, for `RGB` use `3`).\n# `EXPERIMENT_NAME` defines the name of a file that stores information about results of Bayesian optimization of hyperparameters.\n\n\nSAMPLE = None\nN_CHANNELS = 1\nEXPERIMENT_NAME = 'shallow_clf_tpe_pipeline_cpu_overfit_penalty'\n\n\n# # Data preparation\n#\n# We start from loading data into memory.\n\n\n# Images (both in training and test sets) have different, unregular sizes. Thus, we will use transformations that will first scale the images and then crop (randomly) a square-shaped fragments. After that, images will be converted to `torch.Tensors`.\n#\n# Note that:\n#\n# * `transforms.Resize` will resize the input PIL Image to given size. If size is an int, smaller edge of the image will be mathced to this number, ie. if `height > width`, then image will be rescaled to `(size * height / width, size)`,\n# * `transforms.RandomCrop` will crop the given PIL Image at a random location. If size is an int instead of sequence like `(height, width)`, a square crop `(size, size)` is made.\n\n\nsimple_transform = transforms.Compose([\n transforms.Grayscale(),\n transforms.Resize(32),\n transforms.RandomCrop(32),\n transforms.ToTensor()\n])\n\n\n# Then, we will create an object that will allow us to load images in *mini-batches*. For each channel (or a single channel in case of grayscale images) we will calculate mean and standard deviation. This values will be used to normalize the input data.\n\n\ndataset = ImageFoldersDataset(\n path_to_data='../data/cpu/train',\n transform=simple_transform\n)\n\n\n# Notice that even for small images, ie. `(224, 224, 3)` calculating means of each channel requires performing operations on vectors of sizes: `n_samples * 224 * 224 * 3` (on an ordinary laptop problems with lack of memory may occur). 
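# A rough back-of-the-envelope sketch of the memory claim above, assuming float32 tensors (the dtype produced by `ToTensor`) and the 17,000 training images referenced in the batch loop below; actual memory usage may differ somewhat:


n_samples, bytes_per_value = 17000, 4  # assumed sample count; float32 = 4 bytes

full_gb = n_samples * 224 * 224 * 3 * bytes_per_value / 1024 ** 3
small_gb = n_samples * 32 * 32 * 1 * bytes_per_value / 1024 ** 3

print('224 x 224 x 3 float32 images: ~{:.2f} GB'.format(full_gb))   # roughly 9.5 GB
print(' 32 x  32 x 1 float32 images: ~{:.2f} GB'.format(small_gb))  # well under 0.1 GB
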
Thus, for benchmark purposes we used smaller images with output shape of `(32, 32, 1)`.\n\n\nget_ipython().run_cell_magic('time', '',\n \"from collections import defaultdict\\n\\nmeans, stds = defaultdict(list), defaultdict(list)\\ncounter = 0\\nfor data in dataset.loader(batch_size=170):\\n images, _ = data\\n for channel in list(range(N_CHANNELS)):\\n means[channel].append(images[:, channel, :, :].mean().item())\\n stds[channel].append(images[:, channel, :, :].std().item())\\n counter += 1\\n if counter % 10 == 0:\\n print('Mean calculated for {n} batches'.format(n=counter / (17000 / 170)))\\n# save means and stds\\nmeans = [np.mean(means[channel]) for channel in list(range(N_CHANNELS))]\\nstds = [np.mean(stds[channel]) for channel in list(range(N_CHANNELS))]\")\n\n\n# Then, we add a transformation that normalizes the input images and load into memory the number of images defined in a variable `SAMPLE` (or the entire dataset if `SAMPLE` is `None`.\n\n\nsimple_transform = transforms.Compose([\n transforms.Grayscale(),\n transforms.Resize(32),\n transforms.RandomCrop(32),\n transforms.ToTensor(),\n transforms.Normalize(mean=means, std=stds)\n])\n\n\ndataset_normalized = ImageFoldersDataset(\n path_to_data='../data/cpu/train/',\n transform=simple_transform\n)\nif SAMPLE:\n X_train, y_train = dataset_normalized.load_all_images(\n img_shape=(32, 32, 1))\n random_sample_idx = np.random.randint(\n low=0, high=len(X_train), size=SAMPLE)\n X_train, y_train = X_train[random_sample_idx], y_train[random_sample_idx]\nelse:\n X_train, y_train = dataset_normalized.load_all_images(\n img_shape=(32, 32, 1))\n\n\n# ***Important note about reproducibility***.\n#\n# Completely reproducible results are not guaranteed across PyTorch releases, individual commits or different platforms. Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds.\n#\n# However, in order to make computations deterministic on specific problem on one specific platform and Pytorch release, there are a couple of steps to take.\n#\n# There are two pseudorandom number generators involved in PyTorch, which we had to seed manually to made runs reproducible. We implemented `tmle.dataloaders.ImageFolderDataset` setting seed as follows:\n#\n# torch.manual_seed(seed)\n# torch.cuda.manual_seed(seed)\n# np.random.seed(seed)\n# random.seed(seed)\n\n# # Shallow Classifier\n\n\n# Classes are imbalanced.\n\n\nnp.bincount(y_train.astype('int64')) / y_train.shape[0]\n\n\n# We start with the definition of a simple `Pipeline`. It will be used for presentation purposes. With ***default*** values of *hyperparameters* we encounter a problem of overfitting.\n\n\npipeline = Pipeline(steps=[\n ('hog', HOGTransformer(\n img_shape=(32, 32),\n orientations=9,\n pixels_per_cell=(8, 8),\n cells_per_block=(2, 2))\n ),\n ('scaler', StandardScaler()),\n ('svm', LinearSVC())\n])\n\n\n# Fit `pipeline` to training set.\n\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE',\n \"pipeline.fit(X_train, y_train)\\n# save model\\nfrom joblib import dump\\ndump(pipeline, '../models/shallow_clf_starter.joblib')\")\n\n\n# Load model from disk.\n\n\npipeline = load('../models/shallow_clf_starter.joblib')\n\n\n# Make predictions on training set.\n\n\nget_ipython().run_cell_magic('time', '', 'y_train_preds = pipeline.predict(X_train)')\n\n\n# Measure performance with `balanced_accuracy_score` and prepare `classification_report`.\n\n\nprint('Accuracy: {acc:.5f}. 
Balanced accuracy: {bal_acc:.5f}'.format(\n acc=accuracy_score(y_train, y_train_preds),\n bal_acc=balanced_accuracy_score(y_train, y_train_preds)\n))\n\n\nprint(classification_report(y_train, y_train_preds))\n\n\n# Load test set.\n\n\ntest_dataset = ImageFoldersDataset(\n path_to_data='../data/cpu/test/',\n transform=simple_transform\n)\nX_test, y_test = test_dataset.load_all_images(img_shape=(32, 32, 1))\ny_test_preds = pipeline.predict(X_test)\n\n\n# Measure performance on test set.\n\n\nprint('Accuracy: {acc:.5f}. Balanced accuracy: {bal_acc:.5f}'.format(\n acc=accuracy_score(y_test, y_test_preds),\n bal_acc=balanced_accuracy_score(y_test, y_test_preds)\n))\n\n\n# For sure, performance of the model can be better.\n\n# # Hyperparameters tuning\n\n# We start from definition of `Pipeline` which will be fed with hyperparameters values sampled from space defined in next subsection. Our `Pipeline` takes three steps:\n#\n# * `HOGTransformer` which will convert images into feature vector based on histograms of oriented gradients,\n# * `StandardScaler` which scales output of `HOGTransformer` to avoid attributes in greater numeric ranges dominating those in smaller numeric ranges (however, the `HOG` should be on a similar scale),\n# * `LinearSVC` which will classify the images. It scales good in terms of both: number of instances and number of features.\n\n\npipe = Pipeline(steps=[\n ('hog', HOGTransformer(img_shape=(32, 32))),\n ('scaler', StandardScaler()),\n ('svm', LinearSVC(max_iter=20000))\n])\n\n\n# ## Define hyperparameters space\n#\n# Our definition of *hyperparameters space* encourages `TPE` algorithm to suggest `Pipelines` which differs not only in terms of classifier, but also in terms of operations applied to data in *preprocessing* stage.\n\n\nspace = dict()\nspace['hog__orientations'] = hp.choice('orientations', [9, 12, 18])\nspace['hog__pixels_per_cell'] = hp.choice('pixels_per_cell', [(4, 4), (8, 8)])\nspace['hog__cells_per_block'] = hp.choice(\n 'cells_per_block', [(1, 1), (2, 2), (4, 4)])\nspace['hog__block_norm'] = hp.choice('block_norm', ['L1', 'L2-Hys'])\nspace['svm__loss'] = hp.choice('loss', ['hinge', 'squared_hinge'])\nspace['svm__class_weight'] = hp.choice('class_weight', [None, 'balanced'])\nspace['svm__C'] = hp.uniform('C', 0.0001, 1)\n\n\n# ## Conduct experiments\n#\n# We will conduct experiments in conditions of imbalanced dataset. Therefore, we will use:\n#\n# * `sklearn.model_selection.StratifiedKFold` in order to preserve the comparable share of instances from given classes in both: training and validation sets,\n# * `sklearn.metrics.balanced_accuracy_score` to measure a performance of given classifier on both: training and validation sets. It is defined as the average of recall obrained on each class. The best value is 1 and the worst value is 0. 
Our loss function was defined as: `1 - mean_balanced_accuracy_score(X_validation)` (when `overfit_penalty` is not `None` then constant is added to losses when `mean_balanced_accuracy_score(X_train) - mean_balanced_accuracy_score(X_validation) > overfit_penalty`.\n\n\nclf_optim = ClassifierOptimizer(\n classifier=pipe,\n space=space,\n metric=balanced_accuracy_score\n)\n\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE', 'import warnings\\nwarnings.filterwarnings(\"ignore\")\\n\\nclf_optim.find_best_params(\\n X_train,\\n y_train,\\n experiments_path=\\'../experiments/\\',\\n experiments_name=EXPERIMENT_NAME,\\n max_evals=500,\\n overfit_penalty=0.1\\n)')\n\n\n# # Measure performance on test set\n\n# We will load information about the process of parameters optimization. We will use the best set of parameters to train classifier on whole training set.\n\n\nTRIALS_PATH = os.path.join(\n '../experiments/', '.'.join([EXPERIMENT_NAME, 'hpopt']))\n\nwith open(TRIALS_PATH, 'rb') as trials:\n trials = pickle.load(trials)\n\n\n# The following dictionary shows the best set of parameters.\n\n\nclf_optim.space_eval(trials.best_trial)\n\n\n# We will update `pipe` with *hyperparameters* values.\n\n\nget_ipython().run_cell_magic('skip', '$IMPATIENCE',\n \"pipe.set_params(**clf_optim.space_eval(trials.best_trial))\\npipe.fit(X_train, y_train)\\n# save model\\nfrom joblib import dump\\ndump(pipe, '../models/shallow_clf.joblib')\")\n\n\n\n", "project_metadata": {"full_name": "stasulam/tmle", "description": "Image classification using the classical computer vision approach and transfer learning (with architectures such as ResNet, DenseNet, etc.).", "topics": ["machine-learning", "image-classification", "deep-learning", "transfer-learning", "pytorch", "pytorch-cnn"], "git_url": "git://github.com/stasulam/tmle.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2019-02-28T17:43:29Z", "size": 16584, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6765938, "Python": 27226}, "last_updated": "2020-06-09T14:20:03Z"}, "intent": "# Load model from disk."}, {"original_comment": "# Encode categorical integer features as a one-hot numeric array\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Introduction to Scikit-Learn (sklearn)\n#\n# This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.\n#\n# What we're going to cover:\n# 0. An end-to-end Scikit-Learn workflow\n# 1. Getting the data ready\n# 2. Choose the right estimator/algorithm for our problems\n# 3. Fit the model/algorithm and use it to make predictions on our data\n# 4. Evaluating a model\n# 5. Improve a model\n# 6. Save and load a trained model\n# 7. 
Putting it all together!\n\n#%%\n\n# Let's listify the contents\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom joblib import dump, load\nimport pickle\nfrom sklearn.model_selection import GridSearchCV, train_test_split\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import plot_confusion_matrix\nimport seaborn as sns\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import roc_curve\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import Ridge\nfrom sklearn.datasets import load_boston\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder\nimport pickle # Python object serialization\nfrom sklearn.metrics import classification_report, confusion_matrix, accuracy_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nimport sklearn\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nwhat_were_covering = [\n \"0. An end-to-end Scikit-Learn workflow\",\n \"1. Getting the data ready\",\n \"2. Choose the right estimator/algorithm for our problems\",\n \"3. Fit the model/algorithm and use it to make predictions on our data\",\n \"4. Evaluating a model\",\n \"5. Improve a model\",\n \"6. Save and load a trained model\",\n \"7. Putting it all together!\"]\n\n#%%\n\n# Standard imports\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## 0. An end-to-end Scikit-Learn workflow\n\n#%%\n\n# 1. Get the data ready\nheart_disease = pd.read_csv(\"data/heart-disease.csv\")\nheart_disease\n\n#%%\n\n# Create X (features matrix) choose from age to thal\nX = heart_disease.drop(\"target\", axis=1)\n\n# Create y (labels)\ny = heart_disease[\"target\"] # 0: no heart disease, 1: got heart disease\n\n#%%\n\nX\n\n#%%\n\ny\n\n#%%\n\nsklearn.show_versions()\n\n#%%\n\n# 2. Choose the right model and hyperparameters\nclf = RandomForestClassifier(n_estimators=100)\n\n# We'll keep the default hyperparameters\nclf.get_params()\n\n#%%\n\n# 3. Fit the model to the training data\n\n# test_size=0.2, 80% of data for training and 20% for testing\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\n#%%\n\n# Build a forest of trees from the training set (X, y)\nclf.fit(X_train, y_train)\n\n#%%\n\n# make a prediction\ny_preds = clf.predict(np.array(X_test))\ny_preds\n\n#%%\n\ny_test\n\n#%%\n\n# 4. 
Evaluate the model on the training data and test data\n\n# Returns the mean accuracy on the given test data and labels\nclf.score(X_train, y_train)\n\n#%%\n\nclf.score(X_test, y_test)\n\n#%%\n\nprint(classification_report(y_test, y_preds))\n\n\n# - The precision will be \"how many are correctly classified among that class\"\n# - The recall means \"how many of this class you find over the whole number of element of this class\"\n# - The f1-score is the harmonic mean between precision & recall\n# - The support is the number of occurence of the given class in your dataset (so you have 37.5K of class 0 and 37.5K of class 1, which is a really well balanced dataset.\n\n#%%\n\n# Compute confusion matrix to evaluate the accuracy of a classification\nconfusion_matrix(y_test, y_preds)\n\n#%%\n\n# Accuracy classification score\naccuracy_score(y_test, y_preds)\n\n#%%\n\n# 5. Improve a model\n# Try different amount of n_estimators\nnp.random.seed(42)\nfor i in range(10, 100, 10):\n print(f\"Trying model with {i} estimators...\")\n clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)\n print(\n f\"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%\")\n print(\"\")\n\n#%%\n\n# 6. Save a model and load it\n\npickle.dump(clf, open(\"random_forst_model_1.pkl\", \"wb\")) # write binary\n\n#%%\n\nloaded_model = pickle.load(\n open(\"random_forst_model_1.pkl\", \"rb\")) # read binary\nloaded_model.score(X_test, y_test)\n\n\n# ## 1. Getting our data ready to be used with machine learning\n#\n# Three main things we have to do:\n# 1. Split the data into features and labels (usually `X` & `y`)\n# 2. Filling (also called imputing) or disregarding missing values\n# 3. Converting non-numerical values to numerical values (also called feature encoding)\n\n#%%\n\nheart_disease.head()\n\n#%%\n\n# Split the data into features and labels (usually X & y)\nX = heart_disease.drop(\"target\", axis=1)\nX.head()\n\n#%%\n\n# Split the data into features and labels (usually X & y)\ny = heart_disease[\"target\"]\ny.head()\n\n#%%\n\n# Split the data into training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\n#%%\n\nX_train.shape, X_test.shape, y_train.shape, y_test.shape\n\n#%%\n\nX.shape[0] * 0.8\n\n#%%\n\n242 + 61\n\n#%%\n\nlen(heart_disease)\n\n\n# ### 1.1 Make sure it's all numerical\n\n#%%\n\ncar_sales = pd.read_csv(\"data/car-sales-extended.csv\")\ncar_sales.head()\n\n#%%\n\n# treat doors as categorical\ncar_sales[\"Doors\"].value_counts()\n\n#%%\n\nlen(car_sales)\n\n#%%\n\ncar_sales.dtypes\n\n#%%\n\n# Split into X/y\nX = car_sales.drop(\"Price\", axis=1)\ny = car_sales[\"Price\"]\n\n#%%\n\nX.head()\n\n\n# \n\n#%%\n\ndummies = pd.get_dummies(car_sales[[\"Make\", \"Colour\", \"Doors\"]])\ndummies\n\n#%%\n\n# Turn the categories into numbers", "target_code": "from sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import OneHotEncoder\n\none_hot = OneHotEncoder()\n\ntransformer = ColumnTransformer(\n [(\"one_hot\", one_hot, categorical_features)], remainder=\"passthrough\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Introduction to Scikit-Learn (sklearn)\n#\n# This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.\n#\n# What we're going to cover:\n# 0. An end-to-end Scikit-Learn workflow\n# 1. Getting the data ready\n# 2. Choose the right estimator/algorithm for our problems\n# 3. Fit the model/algorithm and use it to make predictions on our data\n# 4. 
Evaluating a model\n# 5. Improve a model\n# 6. Save and load a trained model\n# 7. Putting it all together!\n\n\n# Let's listify the contents\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom joblib import dump, load\nimport pickle\nfrom sklearn.model_selection import GridSearchCV, train_test_split\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import plot_confusion_matrix\nimport seaborn as sns\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import roc_curve\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.svm import LinearSVC\nfrom sklearn.linear_model import Ridge\nfrom sklearn.datasets import load_boston\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.ensemble import RandomForestRegressor\nimport pickle # Python object serialization\nfrom sklearn.metrics import classification_report, confusion_matrix, accuracy_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nimport sklearn\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nwhat_were_covering = [\n \"0. An end-to-end Scikit-Learn workflow\",\n \"1. Getting the data ready\",\n \"2. Choose the right estimator/algorithm for our problems\",\n \"3. Fit the model/algorithm and use it to make predictions on our data\",\n \"4. Evaluating a model\",\n \"5. Improve a model\",\n \"6. Save and load a trained model\",\n \"7. Putting it all together!\"]\n\n\n# Standard imports\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## 0. An end-to-end Scikit-Learn workflow\n\n\n# 1. Get the data ready\nheart_disease = pd.read_csv(\"data/heart-disease.csv\")\nheart_disease\n\n\n# Create X (features matrix) choose from age to thal\nX = heart_disease.drop(\"target\", axis=1)\n\n# Create y (labels)\ny = heart_disease[\"target\"] # 0: no heart disease, 1: got heart disease\n\n\nX\n\n\ny\n\n\nsklearn.show_versions()\n\n\n# 2. Choose the right model and hyperparameters\nclf = RandomForestClassifier(n_estimators=100)\n\n# We'll keep the default hyperparameters\nclf.get_params()\n\n\n# 3. Fit the model to the training data\n\n# test_size=0.2, 80% of data for training and 20% for testing\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\n\n# Build a forest of trees from the training set (X, y)\nclf.fit(X_train, y_train)\n\n\n# make a prediction\ny_preds = clf.predict(np.array(X_test))\ny_preds\n\n\ny_test\n\n\n# 4. 
Evaluate the model on the training data and test data\n\n# Returns the mean accuracy on the given test data and labels\nclf.score(X_train, y_train)\n\n\nclf.score(X_test, y_test)\n\n\nprint(classification_report(y_test, y_preds))\n\n\n# - The precision will be \"how many are correctly classified among that class\"\n# - The recall means \"how many of this class you find over the whole number of element of this class\"\n# - The f1-score is the harmonic mean between precision & recall\n# - The support is the number of occurence of the given class in your dataset (so you have 37.5K of class 0 and 37.5K of class 1, which is a really well balanced dataset.\n\n\n# Compute confusion matrix to evaluate the accuracy of a classification\nconfusion_matrix(y_test, y_preds)\n\n\n# Accuracy classification score\naccuracy_score(y_test, y_preds)\n\n\n# 5. Improve a model\n# Try different amount of n_estimators\nnp.random.seed(42)\nfor i in range(10, 100, 10):\n print(f\"Trying model with {i} estimators...\")\n clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)\n print(\n f\"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%\")\n print(\"\")\n\n\n# 6. Save a model and load it\n\npickle.dump(clf, open(\"random_forst_model_1.pkl\", \"wb\")) # write binary\n\n\nloaded_model = pickle.load(\n open(\"random_forst_model_1.pkl\", \"rb\")) # read binary\nloaded_model.score(X_test, y_test)\n\n\n# ## 1. Getting our data ready to be used with machine learning\n#\n# Three main things we have to do:\n# 1. Split the data into features and labels (usually `X` & `y`)\n# 2. Filling (also called imputing) or disregarding missing values\n# 3. Converting non-numerical values to numerical values (also called feature encoding)\n\n\nheart_disease.head()\n\n\n# Split the data into features and labels (usually X & y)\nX = heart_disease.drop(\"target\", axis=1)\nX.head()\n\n\n# Split the data into features and labels (usually X & y)\ny = heart_disease[\"target\"]\ny.head()\n\n\n# Split the data into training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n\n\nX_train.shape, X_test.shape, y_train.shape, y_test.shape\n\n\nX.shape[0] * 0.8\n\n\n242 + 61\n\n\nlen(heart_disease)\n\n\n# ### 1.1 Make sure it's all numerical\n\n\ncar_sales = pd.read_csv(\"data/car-sales-extended.csv\")\ncar_sales.head()\n\n\n# treat doors as categorical\ncar_sales[\"Doors\"].value_counts()\n\n\nlen(car_sales)\n\n\ncar_sales.dtypes\n\n\n# Split into X/y\nX = car_sales.drop(\"Price\", axis=1)\ny = car_sales[\"Price\"]\n\n\nX.head()\n\n\n# \n\n\ndummies = pd.get_dummies(car_sales[[\"Make\", \"Colour\", \"Doors\"]])\ndummies\n\n\n# Turn the categories into numbers\ncategorical_features = [\"Make\", \"Colour\", \"Doors\"]\n", "project_metadata": {"full_name": "chesterheng/machinelearning-datascience", "description": "Complete Machine Learning and Data Science: Zero to Mastery", "topics": ["machine-learning", "data-science"], "git_url": "git://github.com/chesterheng/machinelearning-datascience.git", "stars": 11, "watchers": 11, "forks": 6, "created": "2020-05-10T09:38:22Z", "size": 81175, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4070181}, "last_updated": "2020-12-07T17:13:28Z"}, "intent": "# Encode categorical integer features as a one-hot numeric array"}, {"original_comment": "# converting into lowercase\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# 
![](https://static.wixstatic.com/media/80a58d_adc900710a474cd091d5dae9649734f9~mv2.png/v1/fill/w_812,h_353,al_c,lg_1/80a58d_adc900710a474cd091d5dae9649734f9~mv2.png)\n\n# # More To Come. Stay Tuned. !!\n# If there are any suggestions/changes you would like to see in the Kernel please let me know :). Appreciate every ounce of help!\n#\n# **This notebook will always be a work in progress.** Please leave any comments about further improvements to the notebook! Any feedback or constructive criticism is greatly appreciated!. **If you like it or it helps you , you can upvote and/or leave a comment :).**\n\n# - 1. Introduction\n# - 2. Retrieving the Data\n# - 2.1 Load libraries\n# - 2.2 Read the Data\n# - 3. Glimpse of Data\n# - 3.1 Overview of tables\n# - 3.2 Statistical overview of the Data\n# - 4. Data preparation\n# - 4.1 Check for missing data\n# - 5. Data Exploration\n# - 5.1 Project proposal is Approved or not ?\n# - 5.2 Distribution\n# - 5.2.a Distribution of School states\n# - 5.2.b Distribution of project_grade_category (school grade levels (PreK-2, 3-5, 6-8, and 9-12))\n# - 5.2.c Distribution of category of the project\n# - 5.2.d Distribution of number of previously posted applications by the submitting teacher\n# - 5.2.e Distribution of subcategory of the project\n# - 5.2.f Distribution of Project titles\n# - 5.2.g Distribution of price of resource requested\n# - 5.2.h Distribution of quantity of resource requested\n# - 5.2.i Teacher prefix Distribution\n# - 5.3 Top resources needed for the project\n# - 5.4 Word Cloud of resources requested\n# - 5.5 Various popularities in terms of project acceptance rate and project rejection rate\n# - 5.5.a Popular School states in terms of project acceptance rate and project rejection rate\n# - 5.5.b Popular Teacher Prefix in terms of project acceptance rate and project rejection rate\n# - 5.5.c Popular school grade levels in terms of project acceptance rate and project rejection rate\n# - 5.5.d Popular category of the project in terms of project acceptance rate and project rejection rate\n# - 5.5.e Popular subcategory of the project in terms of project acceptance rate and project rejection rate\n# - 5.5.f Popular project titles in terms of project acceptance rate and project rejection rate\n# - 5.6 Project Proposals by US States\n# - 5.7 Project Proposals Mean Acceptance Rate by US States\n# - 5.8 Correlation Matrix and HeatMap of training data\n# - 5.8.a Teacher_prefix and project_is_approved Intervals Correlation\n# - 5.8.b Teacher_number_of_previously_posted_projects and project_is_approved Intervals Correlation\n# - 5.8.c Correlation Matrix and Heatmap of training data\n# - 5.9 Project Submission Time Analysis\n# - 5.9.a Project Submission Month Analysis\n# - 5.9.b Project Submission Weekday Analysis\n# - 5.9.c Project Submission Date Analysis\n# - 5.9.d Project Submission Hour Analysis\n# - 5.10 Top Keywords in project_essay_1\n# - 5.11 Top Keywords in project_essay_2\n# - 5.12 Top Keywords in project_resource_summary\n# - 5.13 Quantity V.S. 
Price\n# - 5.14 Gender Analysis\n# - 5.15 Month wise distribution of number of projects proposal submitted in each state\n# - 5.16 Price requested for resources distribution [I commented the code becuase of excessive rendering time but i wrote the results]\n# - 5.16.a Price requested for resources distribution by different states\n# - 5.16.b Price requested for resources distribution by Teacher prefixes\n# - 5.16.c Price requested for resources distribution by different Genders\n# - 5.16.d Price requested for resources distribution by different project_grade_category\n# - 5.17 CA(California)\n# - 5.17.a Popularities of Teacher prefixes in California\n# - 5.17.b Popularities of school grade levels in California\n# - 5.17.c Top project titles in California\n# - 5.17.d Trend of project submission time in California\n# - 5.18 TX(Texas)\n# - 5.18.a Popularities of Teacher prefixes in Texas\n# - 5.18.b Popularities of school grade levels in Texas\n# - 5.18.c Top project titles in Texas\n# - 5.18.d Trend of project submission time in Texas\n# - 6. Brief Summary/Conclusion :\n\n# ## 1. Intoduction\n\n# **About DonorsChoose:**\n#\n# DonorsChoose.org is a United States\u2013based 501(c)(3) nonprofit organization that allows individuals to donate directly to public school classroom projects. Founded in 2000 by former public school teacher Charles Best, DonorsChoose.org was among the first civic crowdfunding platforms of its kind. The organization has been given Charity Navigator\u2019s highest rating every year since 2005. In January 2018, they announced that 1 million projects had been funded. In 77% of public schools in the United States, at least one project has been requested on DonorsChoose.org. Schools from wealthy areas are more likely to make technology requests, while schools from less affluent areas are more likely to request basic supplies. It's been noted that repeat donors on DonorsChoose typically donate to projects they have no prior relationship with, and most often fund projects serving financially challenged students.\n#\n#\n# **Objective of this Notebook:**\n#\n# In this Notebook i will do Exploratory Analysis.\n#\n# **Objective of the competition:**\n#\n# DonorsChoose.org receives hundreds of thousands of project proposals each year for classroom projects in need of funding. Right now, a large number of volunteers is needed to manually screen each submission before it's approved to be posted on the DonorsChoose.org website.The goal of the competition is to predict whether or not a DonorsChoose.org project proposal submitted by a teacher will be approved, using the text of project descriptions as well as additional metadata about the project, teacher, and school. DonorsChoose.org can then use this information to identify projects most likely to need further review before approval.\n#\n\n# # 2. 
Retrieving the Data\n\n# ## 2.1 Load libraries\n\n#%%\n\nfrom wordcloud import WordCloud\nfrom nltk.corpus import stopwords\nimport re\nfrom subprocess import check_output\nimport warnings\nfrom matplotlib import cm\nfrom numpy import array\nfrom mpl_toolkits.basemap import Basemap\nimport squarify\nimport plotly.tools as tls\nimport plotly.offline as offline\nimport plotly.graph_objs as go\nfrom plotly.offline import init_notebook_mode, iplot\nimport plotly.offline as py\n# package for high-performance, easy-to-use data structures and data analysis\nimport pandas as pd\nimport numpy as np # fundamental package for scientific computing with Python\nimport matplotlib\nimport matplotlib.pyplot as plt # for plotting\nimport seaborn as sns # for making plots with seaborn\ncolor = sns.color_palette()\npy.init_notebook_mode(connected=True)\ninit_notebook_mode(connected=True)\noffline.init_notebook_mode()\n\n# Supress unnecessary warnings so that presentation looks clean\nwarnings.filterwarnings(\"ignore\")\n\n# Print all rows and columns\npd.set_option('display.max_columns', None)\npd.set_option('display.max_rows', None)\n\n#%%\n\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# ## 2.2 Read tha Data\n\n#%%\n\ntrain_data = pd.read_csv(\"../input/train.csv\")\ntest_data = pd.read_csv(\"../input/test.csv\")\nresources_data = pd.read_csv(\"../input/resources.csv\")\n\n## Merging with train and test data ##\ntrain_resource = pd.merge(train_data, resources_data, on=\"id\", how='left')\ntest_resource = pd.merge(test_data, resources_data, on=\"id\", how='left')\n\n#%%\n\nprint(\"Size of training data : \", train_data.shape)\nprint(\"Size of test data : \", test_data.shape)\nprint(\"Size of resource data : \", resources_data.shape)\nprint(\"Size of train_resource data : \", train_resource.shape)\nprint(\"Size of test_resource data : \", test_resource.shape)\n\n\n# # 3. Glimpse of Data\n\n# ## 3.1 Overview of tables\n\n# **Training Data**\n\n#%%\n\ntrain_data.head()\n\n\n# **Test Data**\n\n#%%\n\ntest_data.head()\n\n\n# **Resource Data**\n\n#%%\n\nresources_data.head()\n\n\n# **train_resource**\n\n#%%\n\ntrain_resource.head()\n\n\n# **test_resource**\n\n#%%\n\ntest_resource.head()\n\n\n# ## 3.2 Statistical Overview of the Data\n\n# **Training Data some little info**\n\n#%%\n\ntrain_data.info()\n\n\n# **Little description of training data for numerical features**\n#\n\n#%%\n\ntrain_data.describe()\n\n\n# **Little description of training data for categorical features**\n#\n\n#%%\n\ntrain_data.describe(include=[\"O\"])\n\n\n# **Little description of train_resource data for numerical features**\n#\n\n#%%\n\ntrain_resource.describe()\n\n\n# **Little description of train_resource data for categorical features**\n#\n\n#%%\n\ntrain_resource.describe(include=[\"O\"])\n\n\n# # 4. Data preparation\n\n# ## 4.1 Checking for missing data\n\n# **Missing data in train_data**\n\n#%%\n\n# checking missing data in training data\ntotal = train_data.isnull().sum().sort_values(ascending=False)\npercent = (train_data.isnull().sum()/train_data.isnull().count()\n * 100).sort_values(ascending=False)\nmissing_train_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_train_data.head()\n\n\n# * In training data, we can **project_essay_4** and **project_essay_3** having 96 % null values. 
so during prediction, better remove these 2 columns.\n\n# **Missing data in test_data**\n\n#%%\n\n# checking missing data in test data\ntotal = test_data.isnull().sum().sort_values(ascending=False)\npercent = (test_data.isnull().sum()/test_data.isnull().count()\n * 100).sort_values(ascending=False)\nmissing_test_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_test_data.head()\n\n\n# * In test data, we can **project_essay_4** and **project_essay_3** having 96 % null values. so during prediction, better remove these 2 columns.\n\n# **Missing data in resources_data**\n\n#%%\n\n# checking missing data in resource data\ntotal = resources_data.isnull().sum().sort_values(ascending=False)\npercent = (resources_data.isnull().sum() /\n resources_data.isnull().count()*100).sort_values(ascending=False)\nmissing_resources_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_resources_data.head()\n\n\n# * In resource data, only **description** column having few null values. So we can ignore these values.\n\n# # 5. Data Exploration\n\n# ## 5.1 Project proposal is Approved or not ?\n\n#%%\n\ntemp = train_data['project_is_approved'].value_counts()\nlabels = temp.index\nsizes = (temp / temp.sum())*100\ntrace = go.Pie(labels=labels, values=sizes, hoverinfo='label+percent')\nlayout = go.Layout(title='Project proposal is approved or not')\ndata = [trace]\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Training data is highly imbalanced that is approx. 85 % projetcs were approved and 15 % project were not approved. Majority imbalanced class is positive.\n\n# ## 5.2 Distribution\n\n# ### 5.2.a Distribution of School states\n\n#%%\n\ntemp = train_data[\"school_state\"].value_counts()\n#print(\"Total number of states : \",len(temp))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of School states in % \",\n xaxis=dict(\n title='State Name',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 50 states, **California(CA)** having higher number of projects proposal submitted **approx. 14 %** followed by **Texas(TX)(7 %)** and **Tennessee(NY)(7 %)**.\n\n# ### 5.2.b Distribution of project_grade_category (school grade levels (PreK-2, 3-5, 6-8, and 9-12))\n\n#%%\n\ntemp = train_data[\"project_grade_category\"].value_counts()\nprint(\"Total number of project grade categories : \", len(temp))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of project_grade_category (school grade levels) in %\",\n xaxis=dict(\n title='school grade levels',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in % ',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 4 school grade levels, Project proposals submission in school grade levels is higher for **Grades Prek-2** which is approximately **41 %** followed by **Grades 3-5** which has approx. 
**34 %**.\n\n# ### 5.2.c Distribution of category of the project\n\n#%%\n\ntemp = train_data[\"project_subject_categories\"].value_counts().head(10)\nprint(\"Total number of project subject categories : \", len(\n train_data[\"project_subject_categories\"].value_counts()))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of category of the project in %\",\n xaxis=dict(\n title='category of the project',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 51 Project categories, Project proposals submission for project categories is higher for **Literacy & Language** which is approx. **27 %** followed by **Math & Science** which has approx. **20 %**.\n\n# ### 5.2.d Distribution of number of previously posted applications by the submitting teacher\n\n#%%\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_data['teacher_number_of_previously_posted_projects'])\nplt.xlabel(\n 'number of previously posted applications by the submitting teacher', fontsize=12)\nplt.title(\n \"Histogram of number of previously posted applications by the submitting teacher\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_data.shape[0]), np.sort(\n train_data.teacher_number_of_previously_posted_projects.values))\nplt.xlabel(\n 'number of previously posted applications by the submitting teacher', fontsize=12)\nplt.title(\n \"Distribution of number of previously posted applications by the submitting teacher\")\nplt.show()\n\n\n# ### 5.2.e Distribution of subcategory of the project\n\n#%%\n\ntemp = train_data[\"project_subject_subcategories\"].value_counts().head(10)\nprint(\"Total sub-categories of the projects : \",\n len(train_data[\"project_subject_subcategories\"]))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of subcategory of the project in %\",\n xaxis=dict(\n title='subcategory of the project',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 1,82,020 Project subcategories, Project proposals submission for project sub-categoriesis is higher for **Literacy** which is approx. **16 % ** followed by **Literacy & Mathematics** which has approx. 
**16 %** .\n\n# ### 5.2.f Distribution of Project titles\n\n#%%\n\ntemp = train_data[\"project_title\"].value_counts().head(10)\nprint(\"Total project titles are : \", len(train_data[\"project_title\"]))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of Distribution of Project titles in %\",\n xaxis=dict(\n title='Project Title',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 1,82,080 project titles, Project proposals submission for project titles is higher for **Flexible seating** which is approx. **27 %** followed by **Whiggle while your work** which has approx. **14 %**.\n\n# ### 5.2.g Distribution of price of resource requested\n\n#%%\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_resource['price'])\nplt.xlabel('Price', fontsize=12)\nplt.title(\"Histogran of price of resource requested\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_resource.shape[0]),\n np.sort(train_resource.price.values))\nplt.xlabel('price', fontsize=12)\nplt.title(\"Distribution of price of resource requested\")\nplt.show()\n\n\n# ### 5.2.h Distribution of quantity of resource requested\n\n#%%\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_resource['price'])\nplt.xlabel('quantity', fontsize=12)\nplt.title(\"Histogran of quantity of resource requested\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_resource.shape[0]), np.sort(\n train_resource.quantity.values))\nplt.xlabel('price', fontsize=12)\nplt.title(\"Distribution of quantity of resource requested\")\nplt.show()\n\n\n# ### 5.2.i Teacher prefix Distribution\n\n#%%\n\ntemp = train_data[\"teacher_prefix\"].value_counts()\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Teacher prefix Distribution in %\",\n xaxis=dict(\n title='Teacher prefix',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Higher number of project proposal submitted by **married womens** which is approx. **53 %** followed by **unmarried womens** which has approx. **37 %**.\n# * Project proposal submitted by **Teacher** which is approx. 
\n# ## 5.3 Top resources needed for the project\n\n#%%\n\nREPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\nBAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\nSTOPWORDS = set(stopwords.words('english'))\n\n\ndef text_prepare(text):\n    \"\"\"\n    text: a string\n\n    return: the cleaned string (lowercased, punctuation replaced or removed, stopwords dropped)\n    \"\"\"\n    text = text.lower()  # lowercase text\n    # replace REPLACE_BY_SPACE_RE symbols by space in text\n    text = REPLACE_BY_SPACE_RE.sub(' ', text)\n    # delete symbols which are in BAD_SYMBOLS_RE from text\n    text = BAD_SYMBOLS_RE.sub('', text)\n    # drop stopwords and rejoin the remaining tokens\n    text = ' '.join(s.strip() for s in text.split() if s not in STOPWORDS)\n    return text.strip()\n\n#%%\n\ntemp_data = train_data.dropna(subset=['project_resource_summary'])\n# converting into lowercase\ntemp_data['project_resource_summary'] = temp_data['project_resource_summary'].apply(\n    lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['project_resource_summary'] = temp_data['project_resource_summary'].map(\n    text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n    ' '.join(temp_data['project_resource_summary'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Top resources needed for the project\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.4 Word Cloud of resources requested\n\n#%%\n\ntemp_data = train_resource.dropna(subset=['description'])\n# converting into lowercase\ntemp_data['description'] = temp_data['description'].apply(\n    lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['description'] = temp_data['description'].map(text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n    ' '.join(temp_data['description'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Word Cloud of resources requested\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.5 Various popularities in terms of project acceptance rate and project rejection rate\n\n# ### 5.5.a Popular School states in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"school_state\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"school_state\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"school_state\"] == val] == 0))\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n    title=\"Popular School states in terms of project acceptance rate and project rejection rate\",\n    barmode='stack',\n    width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.b Popular Teacher Prefix in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"teacher_prefix\"].value_counts()\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"teacher_prefix\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"teacher_prefix\"] == val] == 0))\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected 
Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular Teacher prefixes in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.c Popular school grade levels in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"project_grade_category\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_grade_category\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_grade_category\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular school grade levels in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.d Popular category of the project in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"project_subject_categories\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_categories\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_categories\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular category of the project in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.e Popular subcategory of the project in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"project_subject_subcategories\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_subcategories\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_subcategories\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular subcategory of the project in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.f Popular project titles in terms of project acceptance rate and project rejection rate\n\n#%%\n\ntemp = train_data[\"project_title\"].value_counts().head(20)\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"project_title\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"project_title\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n 
x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular project titles in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ## 5.6 Project Proposals by US States\n\n#%%\n\ntemp = pd.DataFrame(train_data[\"school_state\"].value_counts()).reset_index()\ntemp.columns = ['state_code', 'num_proposals']\n\ndata = [dict(\n type='choropleth',\n locations=temp['state_code'],\n locationmode='USA-states',\n z=temp['num_proposals'].astype(float),\n text=temp['state_code'],\n colorscale='Red',\n marker=dict(line=dict(width=0.7)),\n colorbar=dict(autotick=False, tickprefix='',\n title='Number of project proposals'),\n )]\nlayout = dict(title='Project Proposals by US States', geo=dict(\n scope='usa',\n projection=dict(type='albers usa'),\n showlakes=True,\n lakecolor='rgb(255, 255, 255)'),\n)\nfig = dict(data=data, layout=layout)\npy.iplot(fig, validate=False)\n\n\n# ## 5.7 Project Proposals Mean Acceptance Rate by US States\n\n#%%\n\ntemp = pd.DataFrame(train_data.groupby(\"school_state\")[\n \"project_is_approved\"].apply(np.mean)).reset_index()\ntemp.columns = ['state_code', 'num_proposals']\n\ndata = [dict(\n type='choropleth',\n locations=temp['state_code'],\n locationmode='USA-states',\n z=temp['num_proposals'].astype(float),\n text=temp['state_code'],\n colorscale='Red',\n marker=dict(line=dict(width=0.7)),\n colorbar=dict(autotick=False, tickprefix='',\n title='Number of project proposals'),\n )]\nlayout = dict(title='Project Proposals Mean Acceptance Rate by US States', geo=dict(\n scope='usa',\n projection=dict(type='albers usa'),\n showlakes=True,\n lakecolor='rgb(255, 255, 255)'),\n)\nfig = dict(data=data, layout=layout)\npy.iplot(fig, validate=False)\n\n\n# ## 5.8 Correlation Matrix and HeatMap of training data\n\n# ### 5.8.a Teacher_prefix and project_is_approved Intervals Correlation\n\n#%%\n\ncols = ['teacher_prefix', 'project_is_approved']\ncm = sns.light_palette(\"red\", as_cmap=True)\npd.crosstab(train_data[cols[0]], train_data[cols[1]]\n ).style.background_gradient(cmap=cm)\n\n\n# ### 5.8.b Teacher_number_of_previously_posted_projects and project_is_approved Intervals Correlation\n\n#%%\n\ncols = ['teacher_number_of_previously_posted_projects', 'project_is_approved']\ncm = sns.light_palette(\"red\", as_cmap=True)\npd.crosstab(train_data[cols[0]], train_data[cols[1]]\n ).style.background_gradient(cmap=cm)\n\n\n# * Number of previously posted applications by the submitting teacher was** Zero(0)** having more number of acceptance rate.\n\n# ### 5.8.c Correlation Matrix and Heatmap of training data\n\n#%%\n\n# Correlation Matrix\ncorr = train_data.corr()\nplt.figure(figsize=(12, 12))\nsns.heatmap(corr,\n xticklabels=corr.columns.values,\n yticklabels=corr.columns.values, annot=True, cmap='cubehelix', square=True)\nplt.title('Correlation between different features')\ncorr\n\n\n# ## 5.9 Project Submission Time Analysis\n\n#%%\n\ntrain_data[\"project_submitted_datetime\"] = pd.to_datetime(\n train_data[\"project_submitted_datetime\"])\ntrain_data[\"month_created\"] = train_data[\"project_submitted_datetime\"].dt.month\ntrain_data[\"weekday_created\"] = train_data[\"project_submitted_datetime\"].dt.weekday\ntrain_data[\"date_created\"] = train_data[\"project_submitted_datetime\"].dt.date\ntrain_data[\"hour_created\"] = train_data[\"project_submitted_datetime\"].dt.hour\n\n\n# ### 5.9.a Project Submission Month 
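Analysis\n\n# Editorial aside (not part of the original kernel): sections 5.9.a-5.9.d below all rebuild the same stacked accepted-vs-rejected bar chart by hand. A hypothetical helper, plot_approval_stack, is sketched in the next cell; it relies only on train_data, go and py, which are already defined in this kernel, and is an editorial suggestion rather than the author's code.\n\n#%%\n\n# Hedged sketch of a reusable accepted-vs-rejected stacked bar plot.\ndef plot_approval_stack(column, title, sort_index=False):\n    counts = train_data[column].value_counts()\n    if sort_index:\n        counts = counts.sort_index()\n    approved = train_data.groupby(column)['project_is_approved'].sum().reindex(counts.index)\n    rejected = counts - approved\n    data = [go.Bar(x=counts.index, y=approved.values, name='Accepted Proposals'),\n            go.Bar(x=counts.index, y=rejected.values, name='Rejected Proposals')]\n    layout = go.Layout(title=title, barmode='stack', width=1000)\n    py.iplot(go.Figure(data=data, layout=layout))\n\n# Example usage (equivalent to the month cell that follows):\n# plot_approval_stack('month_created', 'Project Proposal Submission Month Distribution')\n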
\n#%%\n\ntemp = train_data[\"month_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"month_created\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"month_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n    title=\"Project Proposal Submission Month Distribution\",\n    barmode='stack',\n    width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * **August** has the second-highest number of proposals, followed by **September**.\n\n# ### 5.9.b Project Submission Weekday Analysis\n\n#%%\n\n# Sort by weekday number (0 = Monday) so the Mon-Sun labels below line up with the bars.\ntemp = train_data[\"weekday_created\"].value_counts().sort_index()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"weekday_created\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"weekday_created\"] == val] == 0))\n\ntemp.index = [\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\"]\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n    title=\"Project Proposal Submission weekday Distribution\",\n    barmode='stack',\n    width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * The number of proposals decreases as we move towards the end of the week.\n\n# ### 5.9.c Project Submission Date Analysis\n\n#%%\n\ntemp = train_data[\"date_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"date_created\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"date_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n    title=\"Project Proposal Submission date Distribution\",\n    barmode='stack',\n    width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Looks like we have approximately one year's worth of data (May 2016 to April 2017) in the training set.\n# * There is a sudden spike on a single day (Sep 1, 2016) in the number of proposals (maybe there is some specific reason?).\n\n# ### 5.9.d Project Submission Hour Analysis\n\n#%%\n\ntemp = train_data[\"hour_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n    temp_y1.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"hour_created\"] == val] == 1))\n    temp_y0.append(np.sum(\n        train_data[\"project_is_approved\"][train_data[\"hour_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n    x=temp.index,\n    y=temp_y1,\n    name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n    x=temp.index,\n    y=temp_y0,\n    name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n    title=\"Project Proposal Submission Hour Distribution\",\n    barmode='stack',\n    width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * From Hours 03 to 05, the number of 
proposals decreases.\n# * Hours 06 to 14, number of proposals increases.\n# * At Hour 14 has more number of proposals.\n\n# ## 5.10 Top Keywords in project_essay_1\n\n#%%\n\ntemp_data = train_data.dropna(subset=['project_essay_1'])\n# converting into lowercase\ntemp_data['project_essay_1'] = temp_data['project_essay_1'].apply(\n lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['project_essay_1'] = temp_data['project_essay_1'].map(text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n ' '.join(temp_data['project_essay_1'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Top Keywords in project_essay_1\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.11 Top keywords in project_essay_2\n\n#%%\n\ntemp_data = train_data.dropna(subset=['project_essay_2'])", "target_code": "temp_data['project_essay_2'] = temp_data['project_essay_2'].apply(\n lambda x: \" \".join(x.lower() for x in x.split()))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![](https://static.wixstatic.com/media/80a58d_adc900710a474cd091d5dae9649734f9~mv2.png/v1/fill/w_812,h_353,al_c,lg_1/80a58d_adc900710a474cd091d5dae9649734f9~mv2.png)\n\n# # More To Come. Stay Tuned. !!\n# If there are any suggestions/changes you would like to see in the Kernel please let me know :). Appreciate every ounce of help!\n#\n# **This notebook will always be a work in progress.** Please leave any comments about further improvements to the notebook! Any feedback or constructive criticism is greatly appreciated!. **If you like it or it helps you , you can upvote and/or leave a comment :).**\n\n# - 1. Introduction\n# - 2. Retrieving the Data\n# - 2.1 Load libraries\n# - 2.2 Read the Data\n# - 3. Glimpse of Data\n# - 3.1 Overview of tables\n# - 3.2 Statistical overview of the Data\n# - 4. Data preparation\n# - 4.1 Check for missing data\n# - 5. 
Data Exploration\n# - 5.1 Project proposal is Approved or not ?\n# - 5.2 Distribution\n# - 5.2.a Distribution of School states\n# - 5.2.b Distribution of project_grade_category (school grade levels (PreK-2, 3-5, 6-8, and 9-12))\n# - 5.2.c Distribution of category of the project\n# - 5.2.d Distribution of number of previously posted applications by the submitting teacher\n# - 5.2.e Distribution of subcategory of the project\n# - 5.2.f Distribution of Project titles\n# - 5.2.g Distribution of price of resource requested\n# - 5.2.h Distribution of quantity of resource requested\n# - 5.2.i Teacher prefix Distribution\n# - 5.3 Top resources needed for the project\n# - 5.4 Word Cloud of resources requested\n# - 5.5 Various popularities in terms of project acceptance rate and project rejection rate\n# - 5.5.a Popular School states in terms of project acceptance rate and project rejection rate\n# - 5.5.b Popular Teacher Prefix in terms of project acceptance rate and project rejection rate\n# - 5.5.c Popular school grade levels in terms of project acceptance rate and project rejection rate\n# - 5.5.d Popular category of the project in terms of project acceptance rate and project rejection rate\n# - 5.5.e Popular subcategory of the project in terms of project acceptance rate and project rejection rate\n# - 5.5.f Popular project titles in terms of project acceptance rate and project rejection rate\n# - 5.6 Project Proposals by US States\n# - 5.7 Project Proposals Mean Acceptance Rate by US States\n# - 5.8 Correlation Matrix and HeatMap of training data\n# - 5.8.a Teacher_prefix and project_is_approved Intervals Correlation\n# - 5.8.b Teacher_number_of_previously_posted_projects and project_is_approved Intervals Correlation\n# - 5.8.c Correlation Matrix and Heatmap of training data\n# - 5.9 Project Submission Time Analysis\n# - 5.9.a Project Submission Month Analysis\n# - 5.9.b Project Submission Weekday Analysis\n# - 5.9.c Project Submission Date Analysis\n# - 5.9.d Project Submission Hour Analysis\n# - 5.10 Top Keywords in project_essay_1\n# - 5.11 Top Keywords in project_essay_2\n# - 5.12 Top Keywords in project_resource_summary\n# - 5.13 Quantity V.S. Price\n# - 5.14 Gender Analysis\n# - 5.15 Month wise distribution of number of projects proposal submitted in each state\n# - 5.16 Price requested for resources distribution [I commented the code becuase of excessive rendering time but i wrote the results]\n# - 5.16.a Price requested for resources distribution by different states\n# - 5.16.b Price requested for resources distribution by Teacher prefixes\n# - 5.16.c Price requested for resources distribution by different Genders\n# - 5.16.d Price requested for resources distribution by different project_grade_category\n# - 5.17 CA(California)\n# - 5.17.a Popularities of Teacher prefixes in California\n# - 5.17.b Popularities of school grade levels in California\n# - 5.17.c Top project titles in California\n# - 5.17.d Trend of project submission time in California\n# - 5.18 TX(Texas)\n# - 5.18.a Popularities of Teacher prefixes in Texas\n# - 5.18.b Popularities of school grade levels in Texas\n# - 5.18.c Top project titles in Texas\n# - 5.18.d Trend of project submission time in Texas\n# - 6. Brief Summary/Conclusion :\n\n# ## 1. Intoduction\n\n# **About DonorsChoose:**\n#\n# DonorsChoose.org is a United States\u2013based 501(c)(3) nonprofit organization that allows individuals to donate directly to public school classroom projects. 
Founded in 2000 by former public school teacher Charles Best, DonorsChoose.org was among the first civic crowdfunding platforms of its kind. The organization has been given Charity Navigator\u2019s highest rating every year since 2005. In January 2018, they announced that 1 million projects had been funded. In 77% of public schools in the United States, at least one project has been requested on DonorsChoose.org. Schools from wealthy areas are more likely to make technology requests, while schools from less affluent areas are more likely to request basic supplies. It's been noted that repeat donors on DonorsChoose typically donate to projects they have no prior relationship with, and most often fund projects serving financially challenged students.\n#\n#\n# **Objective of this Notebook:**\n#\n# In this Notebook i will do Exploratory Analysis.\n#\n# **Objective of the competition:**\n#\n# DonorsChoose.org receives hundreds of thousands of project proposals each year for classroom projects in need of funding. Right now, a large number of volunteers is needed to manually screen each submission before it's approved to be posted on the DonorsChoose.org website.The goal of the competition is to predict whether or not a DonorsChoose.org project proposal submitted by a teacher will be approved, using the text of project descriptions as well as additional metadata about the project, teacher, and school. DonorsChoose.org can then use this information to identify projects most likely to need further review before approval.\n#\n\n# # 2. Retrieving the Data\n\n# ## 2.1 Load libraries\n\n\nfrom wordcloud import WordCloud\nfrom nltk.corpus import stopwords\nimport re\nfrom subprocess import check_output\nimport warnings\nfrom matplotlib import cm\nfrom numpy import array\nfrom mpl_toolkits.basemap import Basemap\nimport squarify\nimport plotly.tools as tls\nimport plotly.offline as offline\nimport plotly.graph_objs as go\nfrom plotly.offline import init_notebook_mode, iplot\nimport plotly.offline as py\n# package for high-performance, easy-to-use data structures and data analysis\nimport pandas as pd\nimport numpy as np # fundamental package for scientific computing with Python\nimport matplotlib\nimport matplotlib.pyplot as plt # for plotting\nimport seaborn as sns # for making plots with seaborn\ncolor = sns.color_palette()\npy.init_notebook_mode(connected=True)\ninit_notebook_mode(connected=True)\noffline.init_notebook_mode()\n\n# Supress unnecessary warnings so that presentation looks clean\nwarnings.filterwarnings(\"ignore\")\n\n# Print all rows and columns\npd.set_option('display.max_columns', None)\npd.set_option('display.max_rows', None)\n\n\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# ## 2.2 Read tha Data\n\n\ntrain_data = pd.read_csv(\"../input/train.csv\")\ntest_data = pd.read_csv(\"../input/test.csv\")\nresources_data = pd.read_csv(\"../input/resources.csv\")\n\n## Merging with train and test data ##\ntrain_resource = pd.merge(train_data, resources_data, on=\"id\", how='left')\ntest_resource = pd.merge(test_data, resources_data, on=\"id\", how='left')\n\n\nprint(\"Size of training data : \", train_data.shape)\nprint(\"Size of test data : \", test_data.shape)\nprint(\"Size of resource data : \", resources_data.shape)\nprint(\"Size of train_resource data : \", train_resource.shape)\nprint(\"Size of test_resource data : \", test_resource.shape)\n\n\n# # 3. 
Glimpse of Data\n\n# ## 3.1 Overview of tables\n\n# **Training Data**\n\n\ntrain_data.head()\n\n\n# **Test Data**\n\n\ntest_data.head()\n\n\n# **Resource Data**\n\n\nresources_data.head()\n\n\n# **train_resource**\n\n\ntrain_resource.head()\n\n\n# **test_resource**\n\n\ntest_resource.head()\n\n\n# ## 3.2 Statistical Overview of the Data\n\n# **Training Data some little info**\n\n\ntrain_data.info()\n\n\n# **Little description of training data for numerical features**\n#\n\n\ntrain_data.describe()\n\n\n# **Little description of training data for categorical features**\n#\n\n\ntrain_data.describe(include=[\"O\"])\n\n\n# **Little description of train_resource data for numerical features**\n#\n\n\ntrain_resource.describe()\n\n\n# **Little description of train_resource data for categorical features**\n#\n\n\ntrain_resource.describe(include=[\"O\"])\n\n\n# # 4. Data preparation\n\n# ## 4.1 Checking for missing data\n\n# **Missing data in train_data**\n\n\n# checking missing data in training data\ntotal = train_data.isnull().sum().sort_values(ascending=False)\npercent = (train_data.isnull().sum()/train_data.isnull().count()\n * 100).sort_values(ascending=False)\nmissing_train_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_train_data.head()\n\n\n# * In training data, we can **project_essay_4** and **project_essay_3** having 96 % null values. so during prediction, better remove these 2 columns.\n\n# **Missing data in test_data**\n\n\n# checking missing data in test data\ntotal = test_data.isnull().sum().sort_values(ascending=False)\npercent = (test_data.isnull().sum()/test_data.isnull().count()\n * 100).sort_values(ascending=False)\nmissing_test_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_test_data.head()\n\n\n# * In test data, we can **project_essay_4** and **project_essay_3** having 96 % null values. so during prediction, better remove these 2 columns.\n\n# **Missing data in resources_data**\n\n\n# checking missing data in resource data\ntotal = resources_data.isnull().sum().sort_values(ascending=False)\npercent = (resources_data.isnull().sum() /\n resources_data.isnull().count()*100).sort_values(ascending=False)\nmissing_resources_data = pd.concat(\n [total, percent], axis=1, keys=['Total', 'Percent'])\nmissing_resources_data.head()\n\n\n# * In resource data, only **description** column having few null values. So we can ignore these values.\n\n# # 5. Data Exploration\n\n# ## 5.1 Project proposal is Approved or not ?\n\n\ntemp = train_data['project_is_approved'].value_counts()\nlabels = temp.index\nsizes = (temp / temp.sum())*100\ntrace = go.Pie(labels=labels, values=sizes, hoverinfo='label+percent')\nlayout = go.Layout(title='Project proposal is approved or not')\ndata = [trace]\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Training data is highly imbalanced that is approx. 85 % projetcs were approved and 15 % project were not approved. 
Majority imbalanced class is positive.\n\n# ## 5.2 Distribution\n\n# ### 5.2.a Distribution of School states\n\n\ntemp = train_data[\"school_state\"].value_counts()\n#print(\"Total number of states : \",len(temp))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of School states in % \",\n xaxis=dict(\n title='State Name',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 50 states, **California(CA)** having higher number of projects proposal submitted **approx. 14 %** followed by **Texas(TX)(7 %)** and **Tennessee(NY)(7 %)**.\n\n# ### 5.2.b Distribution of project_grade_category (school grade levels (PreK-2, 3-5, 6-8, and 9-12))\n\n\ntemp = train_data[\"project_grade_category\"].value_counts()\nprint(\"Total number of project grade categories : \", len(temp))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of project_grade_category (school grade levels) in %\",\n xaxis=dict(\n title='school grade levels',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in % ',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 4 school grade levels, Project proposals submission in school grade levels is higher for **Grades Prek-2** which is approximately **41 %** followed by **Grades 3-5** which has approx. **34 %**.\n\n# ### 5.2.c Distribution of category of the project\n\n\ntemp = train_data[\"project_subject_categories\"].value_counts().head(10)\nprint(\"Total number of project subject categories : \", len(\n train_data[\"project_subject_categories\"].value_counts()))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of category of the project in %\",\n xaxis=dict(\n title='category of the project',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 51 Project categories, Project proposals submission for project categories is higher for **Literacy & Language** which is approx. **27 %** followed by **Math & Science** which has approx. 
**20 %**.\n\n# ### 5.2.d Distribution of number of previously posted applications by the submitting teacher\n\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_data['teacher_number_of_previously_posted_projects'])\nplt.xlabel(\n 'number of previously posted applications by the submitting teacher', fontsize=12)\nplt.title(\n \"Histogram of number of previously posted applications by the submitting teacher\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_data.shape[0]), np.sort(\n train_data.teacher_number_of_previously_posted_projects.values))\nplt.xlabel(\n 'number of previously posted applications by the submitting teacher', fontsize=12)\nplt.title(\n \"Distribution of number of previously posted applications by the submitting teacher\")\nplt.show()\n\n\n# ### 5.2.e Distribution of subcategory of the project\n\n\ntemp = train_data[\"project_subject_subcategories\"].value_counts().head(10)\nprint(\"Total sub-categories of the projects : \",\n len(train_data[\"project_subject_subcategories\"]))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of subcategory of the project in %\",\n xaxis=dict(\n title='subcategory of the project',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 1,82,020 Project subcategories, Project proposals submission for project sub-categoriesis is higher for **Literacy** which is approx. **16 % ** followed by **Literacy & Mathematics** which has approx. **16 %** .\n\n# ### 5.2.f Distribution of Project titles\n\n\ntemp = train_data[\"project_title\"].value_counts().head(10)\nprint(\"Total project titles are : \", len(train_data[\"project_title\"]))\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Distribution of Distribution of Project titles in %\",\n xaxis=dict(\n title='Project Title',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig, filename='schoolStateNames')\n\n\n# * Out of 1,82,080 project titles, Project proposals submission for project titles is higher for **Flexible seating** which is approx. **27 %** followed by **Whiggle while your work** which has approx. 
**14 %**.\n\n# ### 5.2.g Distribution of price of resource requested\n\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_resource['price'])\nplt.xlabel('Price', fontsize=12)\nplt.title(\"Histogran of price of resource requested\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_resource.shape[0]),\n np.sort(train_resource.price.values))\nplt.xlabel('price', fontsize=12)\nplt.title(\"Distribution of price of resource requested\")\nplt.show()\n\n\n# ### 5.2.h Distribution of quantity of resource requested\n\n\nplt.figure(figsize=(12, 8))\n\nsns.distplot(train_resource['price'])\nplt.xlabel('quantity', fontsize=12)\nplt.title(\"Histogran of quantity of resource requested\")\nplt.show()\nplt.figure(figsize=(12, 8))\nplt.scatter(range(train_resource.shape[0]), np.sort(\n train_resource.quantity.values))\nplt.xlabel('price', fontsize=12)\nplt.title(\"Distribution of quantity of resource requested\")\nplt.show()\n\n\n# ### 5.2.i Teacher prefix Distribution\n\n\ntemp = train_data[\"teacher_prefix\"].value_counts()\ntrace = go.Bar(\n x=temp.index,\n y=(temp / temp.sum())*100,\n)\ndata = [trace]\nlayout = go.Layout(\n title=\"Teacher prefix Distribution in %\",\n xaxis=dict(\n title='Teacher prefix',\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n ),\n yaxis=dict(\n title='Count of project proposals submitted in %',\n titlefont=dict(\n size=16,\n color='rgb(107, 107, 107)'\n ),\n tickfont=dict(\n size=14,\n color='rgb(107, 107, 107)'\n )\n )\n)\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Higher number of project proposal submitted by **married womens** which is approx. **53 %** followed by **unmarried womens** which has approx. **37 %**.\n# * Project proposal submitted by **Teacher** which is approx. 
**2 %** is vey low as compared to **Mrs., Ms., Mr**.\n\n# ## 5.3 Top resources needed for the project\n\n\nREPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\nBAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\nSTOPWORDS = set(stopwords.words('english'))\n\n\ndef text_prepare(text):\n \"\"\"\n text: a string\n\n return: modified initial string\n \"\"\"\n text = text.lower() # lowercase text\n # replace REPLACE_BY_SPACE_RE symbols by space in text\n text = REPLACE_BY_SPACE_RE.sub(' ', text)\n # delete symbols which are in BAD_SYMBOLS_RE from text\n text = BAD_SYMBOLS_RE.sub('', text)\n # delete stopwords from text\n temp = [s.strip() for s in text.split() if s not in STOPWORDS]\n new_text = ''\n for i in temp:\n new_text += i+' '\n text = new_text\n return text.strip()\n\n\ntemp_data = train_data.dropna(subset=['project_resource_summary'])\n# converting into lowercase\ntemp_data['project_resource_summary'] = temp_data['project_resource_summary'].apply(\n lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['project_resource_summary'] = temp_data['project_resource_summary'].map(\n text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n ' '.join(temp_data['project_resource_summary'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Top resources needed for the project\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.4 Word Cloud of resources requested\n\n\ntemp_data = train_resource.dropna(subset=['description'])\n# converting into lowercase\ntemp_data['description'] = temp_data['description'].apply(\n lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['description'] = temp_data['description'].map(text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n ' '.join(temp_data['description'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Word Cloud of resources requested\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.5 Various popularities in terms of project acceptance rate and project rejection rate\n\n# ### 5.5.a Popular School states in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"school_state\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"school_state\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"school_state\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular School states in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.b Popular Teacher Prefix in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"teacher_prefix\"].value_counts()\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"teacher_prefix\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"teacher_prefix\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, 
trace2]\nlayout = go.Layout(\n title=\"Popular Teacher prefixes in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.c Popular school grade levels in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"project_grade_category\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_grade_category\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_grade_category\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular school grade levels in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.d Popular category of the project in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"project_subject_categories\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_categories\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_categories\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular category of the project in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.e Popular subcategory of the project in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"project_subject_subcategories\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_subcategories\"] == val] == 1))\n temp_y0.append(np.sum(train_data[\"project_is_approved\"]\n [train_data[\"project_subject_subcategories\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular subcategory of the project in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ### 5.5.f Popular project titles in terms of project acceptance rate and project rejection rate\n\n\ntemp = train_data[\"project_title\"].value_counts().head(20)\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"project_title\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"project_title\"] == val] == 0))\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected 
Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Popular project titles in terms of project acceptance rate and project rejection rate\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# ## 5.6 Project Proposals by US States\n\n\ntemp = pd.DataFrame(train_data[\"school_state\"].value_counts()).reset_index()\ntemp.columns = ['state_code', 'num_proposals']\n\ndata = [dict(\n type='choropleth',\n locations=temp['state_code'],\n locationmode='USA-states',\n z=temp['num_proposals'].astype(float),\n text=temp['state_code'],\n colorscale='Red',\n marker=dict(line=dict(width=0.7)),\n colorbar=dict(autotick=False, tickprefix='',\n title='Number of project proposals'),\n )]\nlayout = dict(title='Project Proposals by US States', geo=dict(\n scope='usa',\n projection=dict(type='albers usa'),\n showlakes=True,\n lakecolor='rgb(255, 255, 255)'),\n)\nfig = dict(data=data, layout=layout)\npy.iplot(fig, validate=False)\n\n\n# ## 5.7 Project Proposals Mean Acceptance Rate by US States\n\n\ntemp = pd.DataFrame(train_data.groupby(\"school_state\")[\n \"project_is_approved\"].apply(np.mean)).reset_index()\ntemp.columns = ['state_code', 'num_proposals']\n\ndata = [dict(\n type='choropleth',\n locations=temp['state_code'],\n locationmode='USA-states',\n z=temp['num_proposals'].astype(float),\n text=temp['state_code'],\n colorscale='Red',\n marker=dict(line=dict(width=0.7)),\n colorbar=dict(autotick=False, tickprefix='',\n title='Number of project proposals'),\n )]\nlayout = dict(title='Project Proposals Mean Acceptance Rate by US States', geo=dict(\n scope='usa',\n projection=dict(type='albers usa'),\n showlakes=True,\n lakecolor='rgb(255, 255, 255)'),\n)\nfig = dict(data=data, layout=layout)\npy.iplot(fig, validate=False)\n\n\n# ## 5.8 Correlation Matrix and HeatMap of training data\n\n# ### 5.8.a Teacher_prefix and project_is_approved Intervals Correlation\n\n\ncols = ['teacher_prefix', 'project_is_approved']\ncm = sns.light_palette(\"red\", as_cmap=True)\npd.crosstab(train_data[cols[0]], train_data[cols[1]]\n ).style.background_gradient(cmap=cm)\n\n\n# ### 5.8.b Teacher_number_of_previously_posted_projects and project_is_approved Intervals Correlation\n\n\ncols = ['teacher_number_of_previously_posted_projects', 'project_is_approved']\ncm = sns.light_palette(\"red\", as_cmap=True)\npd.crosstab(train_data[cols[0]], train_data[cols[1]]\n ).style.background_gradient(cmap=cm)\n\n\n# * Number of previously posted applications by the submitting teacher was** Zero(0)** having more number of acceptance rate.\n\n# ### 5.8.c Correlation Matrix and Heatmap of training data\n\n\n# Correlation Matrix\ncorr = train_data.corr()\nplt.figure(figsize=(12, 12))\nsns.heatmap(corr,\n xticklabels=corr.columns.values,\n yticklabels=corr.columns.values, annot=True, cmap='cubehelix', square=True)\nplt.title('Correlation between different features')\ncorr\n\n\n# ## 5.9 Project Submission Time Analysis\n\n\ntrain_data[\"project_submitted_datetime\"] = pd.to_datetime(\n train_data[\"project_submitted_datetime\"])\ntrain_data[\"month_created\"] = train_data[\"project_submitted_datetime\"].dt.month\ntrain_data[\"weekday_created\"] = train_data[\"project_submitted_datetime\"].dt.weekday\ntrain_data[\"date_created\"] = train_data[\"project_submitted_datetime\"].dt.date\ntrain_data[\"hour_created\"] = train_data[\"project_submitted_datetime\"].dt.hour\n\n\n# ### 5.9.a Project Submission Month Analysis\n\n\ntemp = train_data[\"month_created\"].value_counts()\n# 
print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"month_created\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"month_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Project Proposal Submission Month Distribution\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * **August month** has the second number of proposals followed by **September month** .\n\n# ### 5.9.b Project Submission Weekday Analysis\n\n\ntemp = train_data[\"weekday_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"weekday_created\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"weekday_created\"] == val] == 0))\n\ntemp.index = [\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\"]\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Project Proposal Submission weekday Distribution\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * The number of proposals decreases as we move towards the end of the week.\n\n# ### 5.9.c Project Submission Date Analysis\n\n\ntemp = train_data[\"date_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"date_created\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"date_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Project Proposal Submission date Distribution\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * Looks like we have approximately one years' worth of data (May 2016 to April 2017) given in the training set.\n# * There is a sudden spike on a single day (Sep 1, 2016) with respect to the number of proposals (may be some specific reason?)\n\n# ### 5.9.d Project Submission Hour Analysis\n\n\ntemp = train_data[\"hour_created\"].value_counts()\n# print(temp.values)\ntemp_y0 = []\ntemp_y1 = []\nfor val in temp.index:\n temp_y1.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"hour_created\"] == val] == 1))\n temp_y0.append(np.sum(\n train_data[\"project_is_approved\"][train_data[\"hour_created\"] == val] == 0))\n\ntrace1 = go.Bar(\n x=temp.index,\n y=temp_y1,\n name='Accepted Proposals'\n)\ntrace2 = go.Bar(\n x=temp.index,\n y=temp_y0,\n name='Rejected Proposals'\n)\n\ndata = [trace1, trace2]\nlayout = go.Layout(\n title=\"Project Proposal Submission Hour Distribution\",\n barmode='stack',\n width=1000\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)\n\n\n# * From Hours 03 to 05, number of proposals decreases.\n# * Hours 06 to 14, number of proposals increases.\n# * At Hour 14 
has more number of proposals.\n\n# ## 5.10 Top Keywords in project_essay_1\n\n\ntemp_data = train_data.dropna(subset=['project_essay_1'])\n# converting into lowercase\ntemp_data['project_essay_1'] = temp_data['project_essay_1'].apply(\n lambda x: \" \".join(x.lower() for x in x.split()))\ntemp_data['project_essay_1'] = temp_data['project_essay_1'].map(text_prepare)\n\n\nwordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(\n ' '.join(temp_data['project_essay_1'].values))\nplt.figure(figsize=(15, 8))\nplt.imshow(wordcloud)\nplt.title(\"Top Keywords in project_essay_1\", fontsize=35)\nplt.axis(\"off\")\nplt.show()\n\n\n# ## 5.11 Top keywords in project_essay_2\n\n\ntemp_data = train_data.dropna(subset=['project_essay_2'])\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# converting 'project_essay_2' into lowercase"}, {"original_comment": " # Training and Testing Data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.datasets import load_breast_cancer\nimport numpy as np\nimport idx2numpy\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.datasets import load_iris\nfrom sklearn import tree\nfrom sklearn import datasets\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport time\nfrom micromlgen import port\nimport warnings\nwarnings.filterwarnings('ignore')\n\n# Classifiers Load\n\n\n# ### 1. 
Train using Iris Flowers dataset and generate C version of DT and RF Classifiers\n\n#%%\n\n# Iris Datasets load and train DT and RF\n\niris = datasets.load_iris()\nx = iris.data\ny = iris.target\nd = [{\"sepal_length\": row[0],\n \"sepal_width\":row[1],\n \"petal_length\":row[2],\n \"petal_width\":row[3]} for row in x]\ndf = pd.DataFrame(d) # construct dataframe\ndf[\"types\"] = y # assign types\ndf = df.sample(frac=1.0) # random shuffle rows\ndf.head()\n# train test split, ratio = 0.8\nfeatures = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\ntypes = df[\"types\"]\ntrain_features, test_features, train_types, test_types = train_test_split(\n features, types, train_size=0.8, random_state=1)\ntrain_features_x, test_features_x, train_types_x, test_types_x = train_test_split(\n features, types, train_size=0.998, random_state=1)\ntrain_features_x, test_features_100x, train_types_x, test_types_x = train_test_split(\n features, types, train_size=0.334, random_state=1)\n\n#%%\n\nprint(test_features_x.shape)\nprint(test_features_100x.shape)\n\n\n# ### 1.1 Iris Decision Tree\n\n#%%\n\n# Iris DT\nclf_DT = tree.DecisionTreeClassifier()\nclf_DT = clf_DT.fit(train_features, train_types)\n\n\nstart = time.time()\nprediction = clf_DT.predict(test_features_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = clf_DT.predict(test_features_100x)\nstop = time.time()\nprint(f\"Inference time for 100 samples: {stop - start}s\")\n\nprediction = clf_DT.predict(test_features)\n# Copy paste the port() output in DT_iris.h\nprint(classification_report(test_types, prediction))\nprint(port(clf_DT))\n\nf = open(\"./Trained_classifiers/DT_iris.h\", \"w\")\nf.write(port(clf_DT))\nf.close()\n\n\n# ### 1.2 Iris Random Forest\n\n#%%\n\n# Iris RF\nclf_RF = RandomForestClassifier()\nclf_RF.fit(train_features, train_types)\n\nstart = time.time()\nprediction = clf_RF.predict(test_features_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = clf_RF.predict(test_features_100x)\nstop = time.time()\nprint(f\"Inference time for 100 samples: {stop - start}s\")\n\nprediction = clf_RF.predict(test_features)\n\n# Copy paste the port() output in RF_iris.h\nprint(classification_report(test_types, prediction,\n target_names=[\"type0\", \"type1\", \"type2\"]))\nprint(port(clf_DT))\n\nf = open(\"./Trained_classifiers/RF_iris.h\", \"w\")\nf.write(port(clf_RF))\nf.close()\n\n\n# ## 2. 
Train using Heart Disease dataset and generate C version of DT and RF Classifiers\n\n#%%\n\ndataset = pd.read_csv('dataset.csv')\n# dataset = pd.get_dummies(dataset, columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])\nstandardScaler = StandardScaler()\ncolumns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']\n# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])\ny = dataset['target']\nX = dataset.drop(['target'], axis=1)\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.20, random_state=0)\nX_train_x, X_test_x, y_train_x, y_test_x = train_test_split(\n X, y, test_size=0.001, random_state=0)\nX_train_x, X_test_100x, y_train_x, y_test_x = train_test_split(\n X, y, test_size=0.33, random_state=0)\n\n#%%\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 2.1 Heart Disease DT\n\n#%%\n\ndt_classifier = tree.DecisionTreeClassifier(max_features=4, random_state=0)\ndt_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = dt_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = dt_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = dt_classifier.predict(X_test)\n\n# Copy paste the port() output in DT_Heart.h\nprint(classification_report(y_test, prediction))\nprint(port(dt_classifier))\n\nf = open(\"./Trained_classifiers/DT_Heart.h\", \"w\")\nf.write(port(dt_classifier))\nf.close()\n\n\n# ### 2.2 Heart Disease RF\n\n#%%\n\nrf_classifier = RandomForestClassifier(n_estimators=10, random_state=0)\n# n_estimators = 500, random_state = 0\nrf_classifier.fit(X_train, y_train)\n\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Heart.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Heart.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 3. 
Train using Breast Cancer dataset and generate C version of DT and RF Classifiers\n\n#%%\n\ncancer = load_breast_cancer()\nX = cancer.data\ny = cancer.target\n\n\nX_train, X_test, y_train, y_test = train_test_split(X,\n y,\n train_size=0.8,\n random_state=0)\nX_train_xx, X_test_x, y_train_xx, y_test_xx = train_test_split(X,\n y,\n train_size=0.999,\n random_state=0)\nX_train_xx, X_test_100x, y_train_xx, y_test_xx = train_test_split(X,\n y,\n train_size=0.825,\n random_state=0)\n\n#%%\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 3.1 Breast Cancer DT\n\n#%%\n\nB_tree = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nB_tree.fit(X_train, y_train)\n\nstart = time.time()\nprediction = B_tree.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = B_tree.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = B_tree.predict(X_test)\n\n# Copy paste the port() output in DT_Cancer.h\nprint(classification_report(y_test, prediction))\nprint(port(B_tree))\n\nf = open(\"./Trained_classifiers/DT_Cancer.h\", \"w\")\nf.write(port(B_tree))\nf.close()\n\n\n# ### 3.2 Breast Cancer RF\n\n#%%\n\nrf_classifier = RandomForestClassifier(n_estimators=10, random_state=0)\nrf_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Cancer.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Cancer.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 4. 
Train using Handwritten Digits dataset and generate C version of DT and RF Classifiers\n\n#%%\n\nX_train_3D = idx2numpy.convert_from_file('train-images.idx3-ubyte')\nX_train = X_train_3D.flatten().reshape(60000, 784)\n\ny_train = idx2numpy.convert_from_file('train-labels.idx1-ubyte')\n\nX_test_3D = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')\nX_test = X_test_3D.flatten().reshape(10000, 784)\n\ny_test = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')\n\n\nX_train_xx, X_test_x, y_train_xx, y_test_xx = train_test_split(X_train,\n y_train,\n train_size=0.999999999,\n random_state=0)\nX_train_xx, X_test_100x, y_train_xx, y_test_xx = train_test_split(X_train,\n y_train,\n train_size=0.99834,\n random_state=0)\n\n#%%\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 4.1 Handwritten Digits DT\n\n#%%\n\ndt_classifier_digits = tree.DecisionTreeClassifier(max_depth=10)\ndt_classifier_digits.fit(X_train, y_train)\n\nstart = time.time()\nprediction = dt_classifier_digits.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = dt_classifier_digits.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = dt_classifier_digits.predict(X_test)\n\n# Copy paste the port() output in DT_digits.h\nprint(classification_report(y_test, prediction))\nprint(port(dt_classifier_digits))\n\nf = open(\"./Trained_classifiers/DT_digits.h\", \"w\")\nf.write(port(dt_classifier_digits))\nf.close()\n\n\n# ### 4.2 Handwritten Digits RF\n\n#%%\n\nrf_classifier = RandomForestClassifier(n_estimators=1, random_state=0)\nrf_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Digits.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Digits.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 5. 
Train using Banknote Authentication dataset and generate C version of DT and RF Classifiers\n\n#%%\n\n# reading the data\ndata = pd.read_csv('./bank_notes.csv')\ndata.head()\n\n# missing value counts in each of these columns\nmiss = data.isnull().sum()/len(data)\nmiss = miss[miss > 0]\nmiss.sort_values(inplace=True)\nmiss\n\nX = data[[\"variance\", \"skewness\", \"curtosis\", \"entropy\"]]\ny = data[\"Target\"]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=42)\nX_train_1, X_test_x, y_train_1, y_test_1 = train_test_split(\n X, y, test_size=0.0001, random_state=42)\nX_train_1, X_test_100x, y_train_1, y_test_1 = train_test_split(\n X, y, test_size=0.0723, random_state=42)\n\n#%%\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 5.1 Banknote DT\n\n#%%\n\nBank_DT = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nBank_DT.fit(X_train, y_train)\n\nstart = time.time()\nprediction = Bank_DT.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = Bank_DT.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = Bank_DT.predict(X_test)\n\n# Copy paste the port() output in DT_Banknote.h\nprint(classification_report(y_test, prediction))\nprint(port(Bank_DT))\n\nf = open(\"./Trained_classifiers/DT_Banknote.h\", \"w\")\nf.write(port(Bank_DT))\nf.close()\n\n\n# ### 5.2 Banknote RF\n\n#%%\n\nrndF = RandomForestClassifier(max_depth=5, random_state=0)\nrndF.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rndF.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rndF.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nRndF_pred = rndF.predict(X_test)\nprint(classification_report(y_test, RndF_pred))\nprint(port(rndF))\n\n# Copy paste the port() output in RF_Banknote.h\nf = open(\"./Trained_classifiers/RF_Banknote.h\", \"w\")\nf.write(port(rndF))\nf.close()\n\n\n# ## 6. 
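# Aside, not from the original notebook: since port() emits the C source that
# gets written to the .h files, its length is a quick proxy for how much flash
# each model would need on a microcontroller build. Assumes Bank_DT and rndF
# are the fitted models from the cells above.

#%%

from micromlgen import port

dt_code = port(Bank_DT)
rf_code = port(rndF)
print(f"DT header size: {len(dt_code)} characters")
print(f"RF header size: {len(rf_code)} characters")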
Train using Haberman\u2019s Survival dataset and generate C version of DT and RF Classifiers\n\n#%%\n\nurl = \"haberman.data\"\nnames = ['Age', 'Year operation', 'Axillary nodes detected', 'Survival status']\ndataset = pd.read_csv(url, names=names)\narray = dataset.values\nX = array[:, :3]\ny = array[:, 3]\nrandom_state = 4\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n random_state=random_state)\n_, X_test_x, _, _ = train_test_split(X, y, test_size=0.001,\n random_state=random_state)\n_, X_test_100x, _, _ = train_test_split(X, y, test_size=0.324,\n random_state=random_state)\n\n#%%\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 6.1 Haberman\u2019s Survival DT\n\n#%%\n\nDT_clf = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nDT_clf.fit(X_train, y_train)\n\nstart = time.time()\nprediction = DT_clf.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\n\nstart = time.time()\nprediction = DT_clf.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\n# Copy paste the port() output in DT_Survival.h\nDT_clf_pred = DT_clf.predict(X_test)\nprint(classification_report(y_test, DT_clf_pred))\nprint(port(DT_clf))\n\nf = open(\"./Trained_classifiers/DT_Survival.h\", \"w\")\nf.write(port(DT_clf))\nf.close()\n\n\n# ### 6.2 Haberman\u2019s Survival RF\n\n#%%\n\nHab_rndF = RandomForestClassifier(max_depth=5, random_state=0)\nHab_rndF.fit(X_train, y_train)\n\nstart = time.time()\nprediction = Hab_rndF.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = Hab_rndF.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\n# Copy paste the port() output in RF_Survival.h\nHab_RndF_pred = Hab_rndF.predict(X_test)\nprint(classification_report(y_test, Hab_RndF_pred))\nprint(port(Hab_rndF))\n\nf = open(\"./Trained_classifiers/RF_Survival.h\", \"w\")\nf.write(port(Hab_rndF))\nf.close()\n\n\n# ## 7. 
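# Aside, not from the original notebook: a single-sample timing is noisy, so a
# steadier per-sample figure is the ~100-sample time divided by the batch size.
# Assumes Hab_rndF and X_test_100x from the cells above.

#%%

import time

start = time.time()
Hab_rndF.predict(X_test_100x)
elapsed = time.time() - start
print(f"Average per-sample inference time: {elapsed / len(X_test_100x)}s")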
Train using Titanic dataset and generate C version of DT and RF Classifiers\n\n#%%\n\ndef setAgeBoundaries():\n for dataset in combine:\n dataset.loc[dataset['Age'] <= 5, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 5) & (dataset['Age'] <= 16), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 3\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 4\n dataset.loc[dataset['Age'] > 64, 'Age'] = 5\n\n\ndef normalizeFamily():\n for dataset in combine:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n\n\ndef pivotingData(data, entry1, entry2, groupBy, sortBy):\n return data[[entry1, entry2]].groupby([groupBy], as_index=False).mean().sort_values(by=sortBy, ascending=False)\n\n\ndef printPivotedData(data):\n # only categorical values\n print(pivotingData(data, 'Pclass', 'Survived', 'Pclass', 'Survived'))\n print(pivotingData(data, 'Sex', 'Survived', 'Sex', 'Survived'))\n print(pivotingData(data, 'SibSp', 'Survived', 'SibSp', 'Survived'))\n print(pivotingData(data, 'Parch', 'Survived', 'Parch', 'Survived'))\n\n\ndef normalizeSex():\n for dataset in combine:\n dataset['Sex'] = dataset['Sex'].map(\n {'female': 1, 'male': 0}).astype(int)\n\n\ndef normalizeAges():\n guess_ages = np.zeros((2, 3))\n for dataset in combine:\n for i in range(0, 2):\n for j in range(0, 3):\n guess_df = dataset[(dataset['Sex'] == i) & (\n dataset['Pclass'] == j+1)]['Age'].dropna()\n age_guess = guess_df.median()\n guess_ages[i, j] = int(age_guess/0.5 + 0.5) * 0.5\n\n for i in range(0, 2):\n for j in range(0, 3):\n dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) & (\n dataset.Pclass == j+1), 'Age'] = guess_ages[i, j]\n\n dataset['Age'] = dataset['Age'].astype(int)\n\n\ndef normalizeEmbarked():\n freq_port = train_df.Embarked.dropna().mode()[0]\n\n for dataset in combine:\n dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)\n\n for dataset in combine:\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n\ndef normalizeFare():\n\n for dataset in combine:\n dataset.loc[(dataset['Fare'] < 9), 'Fare'] = 0\n dataset.loc[(dataset['Fare'] >= 9) & (\n dataset['Fare'] < 12), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] >= 12) & (\n dataset['Fare'] < 15), 'Fare'] = 2\n dataset.loc[(dataset['Fare'] >= 15) & (\n dataset['Fare'] < 20), 'Fare'] = 3\n dataset.loc[(dataset['Fare'] >= 20) & (\n dataset['Fare'] < 30), 'Fare'] = 4\n dataset.loc[(dataset['Fare'] >= 30) & (\n dataset['Fare'] < 55), 'Fare'] = 5\n dataset.loc[(dataset['Fare'] >= 55) & (\n dataset['Fare'] < 95), 'Fare'] = 6\n dataset.loc[(dataset['Fare'] >= 95), 'Fare'] = 7\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n\ndef normalizeAgeClass():\n for dataset in combine:\n dataset['Age*Class*Fare'] = dataset.Age * dataset.Pclass * dataset.Fare\n dataset['Age*Class'] = dataset.Age * dataset.Pclass\n dataset['Age*Fare'] = dataset.Age * dataset.Fare\n\n\ndef normalizeData():\n normalizeSex()\n normalizeAges()\n setAgeBoundaries()\n normalizeFamily()\n normalizeEmbarked()\n normalizeFare()\n normalizeAgeClass()\n\n\ndef getFareClass(data, cat):\n return data.loc[data['Fare'] == cat]\n\n\ndef main():\n global train_df\n global test_df\n global combine", "target_code": "train_df = pd.read_csv('train_titanic.csv')\ntest_df = pd.read_csv('test_titanic.csv')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.datasets import load_breast_cancer\nimport numpy as np\nimport 
idx2numpy\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.datasets import load_iris\nfrom sklearn import tree\nfrom sklearn import datasets\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport time\nfrom micromlgen import port\nimport warnings\nwarnings.filterwarnings('ignore')\n\n# Classifiers Load\n\n\n# ### 1. Train using Iris Flowers dataset and generate C version of DT and RF Classifiers\n\n\n# Iris Datasets load and train DT and RF\n\niris = datasets.load_iris()\nx = iris.data\ny = iris.target\nd = [{\"sepal_length\": row[0],\n \"sepal_width\":row[1],\n \"petal_length\":row[2],\n \"petal_width\":row[3]} for row in x]\ndf = pd.DataFrame(d) # construct dataframe\ndf[\"types\"] = y # assign types\ndf = df.sample(frac=1.0) # random shuffle rows\ndf.head()\n# train test split, ratio = 0.8\nfeatures = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\ntypes = df[\"types\"]\ntrain_features, test_features, train_types, test_types = train_test_split(\n features, types, train_size=0.8, random_state=1)\ntrain_features_x, test_features_x, train_types_x, test_types_x = train_test_split(\n features, types, train_size=0.998, random_state=1)\ntrain_features_x, test_features_100x, train_types_x, test_types_x = train_test_split(\n features, types, train_size=0.334, random_state=1)\n\n\nprint(test_features_x.shape)\nprint(test_features_100x.shape)\n\n\n# ### 1.1 Iris Decision Tree\n\n\n# Iris DT\nclf_DT = tree.DecisionTreeClassifier()\nclf_DT = clf_DT.fit(train_features, train_types)\n\n\nstart = time.time()\nprediction = clf_DT.predict(test_features_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = clf_DT.predict(test_features_100x)\nstop = time.time()\nprint(f\"Inference time for 100 samples: {stop - start}s\")\n\nprediction = clf_DT.predict(test_features)\n# Copy paste the port() output in DT_iris.h\nprint(classification_report(test_types, prediction))\nprint(port(clf_DT))\n\nf = open(\"./Trained_classifiers/DT_iris.h\", \"w\")\nf.write(port(clf_DT))\nf.close()\n\n\n# ### 1.2 Iris Random Forest\n\n\n# Iris RF\nclf_RF = RandomForestClassifier()\nclf_RF.fit(train_features, train_types)\n\nstart = time.time()\nprediction = clf_RF.predict(test_features_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = clf_RF.predict(test_features_100x)\nstop = time.time()\nprint(f\"Inference time for 100 samples: {stop - start}s\")\n\nprediction = clf_RF.predict(test_features)\n\n# Copy paste the port() output in RF_iris.h\nprint(classification_report(test_types, prediction,\n target_names=[\"type0\", \"type1\", \"type2\"]))\nprint(port(clf_DT))\n\nf = open(\"./Trained_classifiers/RF_iris.h\", \"w\")\nf.write(port(clf_RF))\nf.close()\n\n\n# ## 2. 
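# Note, added here: the Random Forest cell above prints port(clf_DT), i.e. the
# decision tree, while RF_iris.h is written from port(clf_RF). If the intent is
# to inspect the code that actually lands in RF_iris.h, printing the forest
# itself seems more consistent; a minimal sketch, assuming clf_RF from the cell
# above.

from micromlgen import port

print(port(clf_RF))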
Train using Heart Disease dataset and generate C version of DT and RF Classifiers\n\n\ndataset = pd.read_csv('dataset.csv')\n# dataset = pd.get_dummies(dataset, columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])\nstandardScaler = StandardScaler()\ncolumns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']\n# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])\ny = dataset['target']\nX = dataset.drop(['target'], axis=1)\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.20, random_state=0)\nX_train_x, X_test_x, y_train_x, y_test_x = train_test_split(\n X, y, test_size=0.001, random_state=0)\nX_train_x, X_test_100x, y_train_x, y_test_x = train_test_split(\n X, y, test_size=0.33, random_state=0)\n\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 2.1 Heart Disease DT\n\n\ndt_classifier = tree.DecisionTreeClassifier(max_features=4, random_state=0)\ndt_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = dt_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = dt_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = dt_classifier.predict(X_test)\n\n# Copy paste the port() output in DT_Heart.h\nprint(classification_report(y_test, prediction))\nprint(port(dt_classifier))\n\nf = open(\"./Trained_classifiers/DT_Heart.h\", \"w\")\nf.write(port(dt_classifier))\nf.close()\n\n\n# ### 2.2 Heart Disease RF\n\n\nrf_classifier = RandomForestClassifier(n_estimators=10, random_state=0)\n# n_estimators = 500, random_state = 0\nrf_classifier.fit(X_train, y_train)\n\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Heart.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Heart.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 3. 
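# Aside, added here: a StandardScaler is created above but the fit_transform
# line is commented out, presumably because tree-based models are insensitive
# to feature scale. If scaling were wanted (for instance to reuse the pipeline
# with other estimators), it would look like this sketch, assuming dataset and
# columns_to_scale from the cell above.

from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])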
Train using Breast Cancer dataset and generate C version of DT and RF Classifiers\n\n\ncancer = load_breast_cancer()\nX = cancer.data\ny = cancer.target\n\n\nX_train, X_test, y_train, y_test = train_test_split(X,\n y,\n train_size=0.8,\n random_state=0)\nX_train_xx, X_test_x, y_train_xx, y_test_xx = train_test_split(X,\n y,\n train_size=0.999,\n random_state=0)\nX_train_xx, X_test_100x, y_train_xx, y_test_xx = train_test_split(X,\n y,\n train_size=0.825,\n random_state=0)\n\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 3.1 Breast Cancer DT\n\n\nB_tree = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nB_tree.fit(X_train, y_train)\n\nstart = time.time()\nprediction = B_tree.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = B_tree.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = B_tree.predict(X_test)\n\n# Copy paste the port() output in DT_Cancer.h\nprint(classification_report(y_test, prediction))\nprint(port(B_tree))\n\nf = open(\"./Trained_classifiers/DT_Cancer.h\", \"w\")\nf.write(port(B_tree))\nf.close()\n\n\n# ### 3.2 Breast Cancer RF\n\n\nrf_classifier = RandomForestClassifier(n_estimators=10, random_state=0)\nrf_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Cancer.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Cancer.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 4. 
Train using Handwritten Digits dataset and generate C version of DT and RF Classifiers\n\n\nX_train_3D = idx2numpy.convert_from_file('train-images.idx3-ubyte')\nX_train = X_train_3D.flatten().reshape(60000, 784)\n\ny_train = idx2numpy.convert_from_file('train-labels.idx1-ubyte')\n\nX_test_3D = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')\nX_test = X_test_3D.flatten().reshape(10000, 784)\n\ny_test = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')\n\n\nX_train_xx, X_test_x, y_train_xx, y_test_xx = train_test_split(X_train,\n y_train,\n train_size=0.999999999,\n random_state=0)\nX_train_xx, X_test_100x, y_train_xx, y_test_xx = train_test_split(X_train,\n y_train,\n train_size=0.99834,\n random_state=0)\n\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 4.1 Handwritten Digits DT\n\n\ndt_classifier_digits = tree.DecisionTreeClassifier(max_depth=10)\ndt_classifier_digits.fit(X_train, y_train)\n\nstart = time.time()\nprediction = dt_classifier_digits.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = dt_classifier_digits.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = dt_classifier_digits.predict(X_test)\n\n# Copy paste the port() output in DT_digits.h\nprint(classification_report(y_test, prediction))\nprint(port(dt_classifier_digits))\n\nf = open(\"./Trained_classifiers/DT_digits.h\", \"w\")\nf.write(port(dt_classifier_digits))\nf.close()\n\n\n# ### 4.2 Handwritten Digits RF\n\n\nrf_classifier = RandomForestClassifier(n_estimators=1, random_state=0)\nrf_classifier.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rf_classifier.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = rf_classifier.predict(X_test)\n\n# Copy paste the port() output in RF_Digits.h\nprint(classification_report(y_test, prediction))\nprint(port(rf_classifier))\n\nf = open(\"./Trained_classifiers/RF_Digits.h\", \"w\")\nf.write(port(rf_classifier))\nf.close()\n\n\n# ## 5. 
Train using Banknote Authentication dataset and generate C version of DT and RF Classifiers\n\n\n# reading the data\ndata = pd.read_csv('./bank_notes.csv')\ndata.head()\n\n# missing value counts in each of these columns\nmiss = data.isnull().sum()/len(data)\nmiss = miss[miss > 0]\nmiss.sort_values(inplace=True)\nmiss\n\nX = data[[\"variance\", \"skewness\", \"curtosis\", \"entropy\"]]\ny = data[\"Target\"]\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=42)\nX_train_1, X_test_x, y_train_1, y_test_1 = train_test_split(\n X, y, test_size=0.0001, random_state=42)\nX_train_1, X_test_100x, y_train_1, y_test_1 = train_test_split(\n X, y, test_size=0.0723, random_state=42)\n\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 5.1 Banknote DT\n\n\nBank_DT = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nBank_DT.fit(X_train, y_train)\n\nstart = time.time()\nprediction = Bank_DT.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = Bank_DT.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nprediction = Bank_DT.predict(X_test)\n\n# Copy paste the port() output in DT_Banknote.h\nprint(classification_report(y_test, prediction))\nprint(port(Bank_DT))\n\nf = open(\"./Trained_classifiers/DT_Banknote.h\", \"w\")\nf.write(port(Bank_DT))\nf.close()\n\n\n# ### 5.2 Banknote RF\n\n\nrndF = RandomForestClassifier(max_depth=5, random_state=0)\nrndF.fit(X_train, y_train)\n\nstart = time.time()\nprediction = rndF.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = rndF.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\nRndF_pred = rndF.predict(X_test)\nprint(classification_report(y_test, RndF_pred))\nprint(port(rndF))\n\n# Copy paste the port() output in RF_Banknote.h\nf = open(\"./Trained_classifiers/RF_Banknote.h\", \"w\")\nf.write(port(rndF))\nf.close()\n\n\n# ## 6. 
Train using Haberman\u2019s Survival dataset and generate C version of DT and RF Classifiers\n\n\nurl = \"haberman.data\"\nnames = ['Age', 'Year operation', 'Axillary nodes detected', 'Survival status']\ndataset = pd.read_csv(url, names=names)\narray = dataset.values\nX = array[:, :3]\ny = array[:, 3]\nrandom_state = 4\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n random_state=random_state)\n_, X_test_x, _, _ = train_test_split(X, y, test_size=0.001,\n random_state=random_state)\n_, X_test_100x, _, _ = train_test_split(X, y, test_size=0.324,\n random_state=random_state)\n\n\nprint(X_test_x.shape)\nprint(X_test_100x.shape)\n\n\n# ### 6.1 Haberman\u2019s Survival DT\n\n\nDT_clf = tree.DecisionTreeClassifier(criterion='entropy',\n max_depth=3,\n random_state=0)\nDT_clf.fit(X_train, y_train)\n\nstart = time.time()\nprediction = DT_clf.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\n\nstart = time.time()\nprediction = DT_clf.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\n# Copy paste the port() output in DT_Survival.h\nDT_clf_pred = DT_clf.predict(X_test)\nprint(classification_report(y_test, DT_clf_pred))\nprint(port(DT_clf))\n\nf = open(\"./Trained_classifiers/DT_Survival.h\", \"w\")\nf.write(port(DT_clf))\nf.close()\n\n\n# ### 6.2 Haberman\u2019s Survival RF\n\n\nHab_rndF = RandomForestClassifier(max_depth=5, random_state=0)\nHab_rndF.fit(X_train, y_train)\n\nstart = time.time()\nprediction = Hab_rndF.predict(X_test_x)\nstop = time.time()\nprint(f\"Unit Inference time: {stop - start}s\")\n\nstart = time.time()\nprediction = Hab_rndF.predict(X_test_100x)\nstop = time.time()\nprint(f\"Time to infer for 100 samples: {stop - start}s\")\n\n# Copy paste the port() output in RF_Survival.h\nHab_RndF_pred = Hab_rndF.predict(X_test)\nprint(classification_report(y_test, Hab_RndF_pred))\nprint(port(Hab_rndF))\n\nf = open(\"./Trained_classifiers/RF_Survival.h\", \"w\")\nf.write(port(Hab_rndF))\nf.close()\n\n\n# ## 7. 
Train using Titanic dataset and generate C version of DT and RF Classifiers\n\n\ndef setAgeBoundaries():\n for dataset in combine:\n dataset.loc[dataset['Age'] <= 5, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 5) & (dataset['Age'] <= 16), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 3\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 4\n dataset.loc[dataset['Age'] > 64, 'Age'] = 5\n\n\ndef normalizeFamily():\n for dataset in combine:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n\n\ndef pivotingData(data, entry1, entry2, groupBy, sortBy):\n return data[[entry1, entry2]].groupby([groupBy], as_index=False).mean().sort_values(by=sortBy, ascending=False)\n\n\ndef printPivotedData(data):\n # only categorical values\n print(pivotingData(data, 'Pclass', 'Survived', 'Pclass', 'Survived'))\n print(pivotingData(data, 'Sex', 'Survived', 'Sex', 'Survived'))\n print(pivotingData(data, 'SibSp', 'Survived', 'SibSp', 'Survived'))\n print(pivotingData(data, 'Parch', 'Survived', 'Parch', 'Survived'))\n\n\ndef normalizeSex():\n for dataset in combine:\n dataset['Sex'] = dataset['Sex'].map(\n {'female': 1, 'male': 0}).astype(int)\n\n\ndef normalizeAges():\n guess_ages = np.zeros((2, 3))\n for dataset in combine:\n for i in range(0, 2):\n for j in range(0, 3):\n guess_df = dataset[(dataset['Sex'] == i) & (\n dataset['Pclass'] == j+1)]['Age'].dropna()\n age_guess = guess_df.median()\n guess_ages[i, j] = int(age_guess/0.5 + 0.5) * 0.5\n\n for i in range(0, 2):\n for j in range(0, 3):\n dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) & (\n dataset.Pclass == j+1), 'Age'] = guess_ages[i, j]\n\n dataset['Age'] = dataset['Age'].astype(int)\n\n\ndef normalizeEmbarked():\n freq_port = train_df.Embarked.dropna().mode()[0]\n\n for dataset in combine:\n dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)\n\n for dataset in combine:\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n\ndef normalizeFare():\n\n for dataset in combine:\n dataset.loc[(dataset['Fare'] < 9), 'Fare'] = 0\n dataset.loc[(dataset['Fare'] >= 9) & (\n dataset['Fare'] < 12), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] >= 12) & (\n dataset['Fare'] < 15), 'Fare'] = 2\n dataset.loc[(dataset['Fare'] >= 15) & (\n dataset['Fare'] < 20), 'Fare'] = 3\n dataset.loc[(dataset['Fare'] >= 20) & (\n dataset['Fare'] < 30), 'Fare'] = 4\n dataset.loc[(dataset['Fare'] >= 30) & (\n dataset['Fare'] < 55), 'Fare'] = 5\n dataset.loc[(dataset['Fare'] >= 55) & (\n dataset['Fare'] < 95), 'Fare'] = 6\n dataset.loc[(dataset['Fare'] >= 95), 'Fare'] = 7\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n\ndef normalizeAgeClass():\n for dataset in combine:\n dataset['Age*Class*Fare'] = dataset.Age * dataset.Pclass * dataset.Fare\n dataset['Age*Class'] = dataset.Age * dataset.Pclass\n dataset['Age*Fare'] = dataset.Age * dataset.Fare\n\n\ndef normalizeData():\n normalizeSex()\n normalizeAges()\n setAgeBoundaries()\n normalizeFamily()\n normalizeEmbarked()\n normalizeFare()\n normalizeAgeClass()\n\n\ndef getFareClass(data, cat):\n return data.loc[data['Fare'] == cat]\n\n\ndef main():\n global train_df\n global test_df\n global combine\n", "project_metadata": {"full_name": "bharathsudharsan/ML_Classifiers_on_MCUs", "description": null, "topics": [], "git_url": "git://github.com/bharathsudharsan/ML_Classifiers_on_MCUs.git", "stars": 9, "watchers": 9, "forks": 1, "created": 
"2020-11-26T17:08:13Z", "size": 596, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3328836, "C++": 2845249, "Objective-C": 31933, "C": 16143}, "last_updated": "2021-01-07T17:27:09Z"}, "intent": "# Training and Testing Data"}, {"original_comment": "# print the banner first\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pylab as pl\nimport pandas as pd\nimport numpy as np\nimport re\nimport os\nfrom tqdm import tnrange, tqdm_notebook\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport fastjet as fj\nimport pythia8\nfrom pythiafjtools import pypythiafjtools as pyfj\nfrom lundplane import pylundplane as lund\nfrom mptools import pymptools as mpt\nfrom tqdm import tqdm\nimport math\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_context('notebook')\nplt.style.use('seaborn-whitegrid')\n# %config InlineBackend.figure_format = 'retina'\n\n#%%\n\ndef create_and_init_pythia(config_strings=[]):\n pythia = pythia8.Pythia()\n for s in config_strings:\n pythia.readString(s)\n for extra_s in [\"Next:numberShowEvent = 0\", \"Next:numberShowInfo = 0\", \"Next:numberShowProcess = 0\"]:\n pythia.readString(extra_s)\n if pythia.init():\n return pythia\n return None\n\n#%%\n\nsconfig_pythia_w = [\"Beams:eCM = 5000.\", \"HardQCD:all = on\",\n \"PartonLevel:ISR = off\",\n \"PartonLevel:MPI = off\",\n \"PhaseSpace:bias2Selection=on\",\n \"PhaseSpace:bias2SelectionPow=4\",\n \"PhaseSpace:bias2SelectionRef=50\"]\nsconfig_pythia = [\"Beams:eCM = 5000.\", \"HardQCD:all = on\",\n \"PartonLevel:ISR = off\",\n \"PartonLevel:MPI = off\",\n \"PhaseSpace:pThatMin = 20\"]\npythia = create_and_init_pythia(sconfig_pythia)", "target_code": "import fastjet as fj\n\nfj.ClusterSequence.print_banner()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pylab as pl\nimport pandas as pd\nimport numpy as np\nimport re\nimport os\nfrom tqdm import tnrange, tqdm_notebook\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport pythia8\nfrom pythiafjtools import pypythiafjtools as pyfj\nfrom lundplane import pylundplane as lund\nfrom mptools import pymptools as mpt\nfrom tqdm import tqdm\nimport math\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_context('notebook')\nplt.style.use('seaborn-whitegrid')\n# %config InlineBackend.figure_format = 'retina'\n\n\ndef create_and_init_pythia(config_strings=[]):\n pythia = pythia8.Pythia()\n for s in config_strings:\n pythia.readString(s)\n for extra_s in [\"Next:numberShowEvent = 0\", \"Next:numberShowInfo = 0\", \"Next:numberShowProcess = 0\"]:\n pythia.readString(extra_s)\n if pythia.init():\n return pythia\n return None\n\n\nsconfig_pythia_w = [\"Beams:eCM = 5000.\", \"HardQCD:all = on\",\n \"PartonLevel:ISR = off\",\n \"PartonLevel:MPI = off\",\n \"PhaseSpace:bias2Selection=on\",\n \"PhaseSpace:bias2SelectionPow=4\",\n \"PhaseSpace:bias2SelectionRef=50\"]\nsconfig_pythia = [\"Beams:eCM = 5000.\", \"HardQCD:all = on\",\n \"PartonLevel:ISR = off\",\n \"PartonLevel:MPI = off\",\n \"PhaseSpace:pThatMin = 20\"]\npythia = create_and_init_pythia(sconfig_pythia)\n", "project_metadata": {"full_name": "matplo/pyjetty", "description": "some work on jets", "topics": [], "git_url": "git://github.com/matplo/pyjetty.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2019-06-20T05:50:27Z", "size": 4776, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1714841, "Python": 1458065, "Shell": 171348, "C": 109668, "C++": 68123, "CMake": 
12982, "SWIG": 1749}, "last_updated": "2021-01-09T01:41:11Z"}, "intent": "# print the banner"}, {"original_comment": "# ## 2.3 Split features and targets from the data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stratified KFold+XGBoost+EDA Tutorial\n# ### **Hyungsuk Kang, Sungkyunkwan University**\n# #### 2017/10/8\n# # Outline\n# * **1. Introduction**\n# * **2. Data preparation**\n# * 2.1 Load data\n# * 2.2 Check for missing values\n# * 2.3 Split features and targets from the data\n# * 2.4 Exploratory Visualization\n# * **3. Training/Predicting Pipeline**\n# * 3.1 Define Gini metric\n# * 3.2 Drop Unnecessary Features\n# * 3.3 Stratified KFold\n# * 3.4 XGBoost\n# * **4. Prediction and submission**\n# * 4.1 Predict and Submit results\n#\n\n# # **1. Introduction**\n#\n# This is a full walkthrough for building the machine learning model for Porto Seguro\u2019s Safe Driver Prediction dataset provided by Porto Seguro. Stratified KFold is used due to inbalance of the output variable. XGBoost is used because it is like the winning ticket for classification problem with formatted data. You can check its success on this link. ([XGBoost winning solutions](https://github.com/dmlc/xgboost/tree/master/demo#machine-learning-challenge-winning-solutions)) First, I will prepare the data (driver's information and whether the driver initiated auto insurance or not) then I will focus on prediction.\n#\n# For more information on XGBoost, click this link.\n#\n# # [XGBoost](https://xgboost.readthedocs.io/en/latest/)\n#\n\n#%%\n\nimport xgboost as xgb\nfrom sklearn.model_selection import StratifiedKFold\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **2. Data Preparation**\n#\n# ## **2.1 Load Data**\n\n#%%\n\ntrain = pd.read_csv('../input/train.csv', na_values=-1)\ntest = pd.read_csv('../input/test.csv', na_values=-1)\n\n\n# ## 2.2 Check for missing values(NaN)\n\n#%%\n\ntrain.isnull().values.any()\n\n\n# ### Fill it with median value of the column\n#\n# this does not harm the distribution of the model", "target_code": "features = train.drop(['id', 'target'], axis=1).values\ntargets = train.target.values\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stratified KFold+XGBoost+EDA Tutorial\n# ### **Hyungsuk Kang, Sungkyunkwan University**\n# #### 2017/10/8\n# # Outline\n# * **1. Introduction**\n# * **2. Data preparation**\n# * 2.1 Load data\n# * 2.2 Check for missing values\n# * 2.3 Split features and targets from the data\n# * 2.4 Exploratory Visualization\n# * **3. Training/Predicting Pipeline**\n# * 3.1 Define Gini metric\n# * 3.2 Drop Unnecessary Features\n# * 3.3 Stratified KFold\n# * 3.4 XGBoost\n# * **4. Prediction and submission**\n# * 4.1 Predict and Submit results\n#\n\n# # **1. Introduction**\n#\n# This is a full walkthrough for building the machine learning model for Porto Seguro\u2019s Safe Driver Prediction dataset provided by Porto Seguro. Stratified KFold is used due to inbalance of the output variable. XGBoost is used because it is like the winning ticket for classification problem with formatted data. You can check its success on this link. 
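# Sketch, not from the original kernel: the outline above promises median
# imputation, a features/targets split, Stratified KFold and XGBoost. A minimal
# version of that pipeline could look like the cell below; the fold count and
# the XGBoost parameters are placeholders, 'id' and 'target' are the column
# names used elsewhere in the kernel, and train is the DataFrame loaded above.

import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# median imputation, as suggested in section 2.2
train_filled = train.fillna(train.median())

features = train_filled.drop(['id', 'target'], axis=1).values
targets = train_filled.target.values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (tr_idx, va_idx) in enumerate(skf.split(features, targets)):
    model = xgb.XGBClassifier(n_estimators=100, max_depth=4)  # placeholder params
    model.fit(features[tr_idx], targets[tr_idx])
    proba = model.predict_proba(features[va_idx])[:, 1]
    auc = roc_auc_score(targets[va_idx], proba)
    print(f"Fold {fold}: AUC = {auc:.4f}, normalized Gini = {2 * auc - 1:.4f}")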
([XGBoost winning solutions](https://github.com/dmlc/xgboost/tree/master/demo#machine-learning-challenge-winning-solutions)) First, I will prepare the data (driver's information and whether the driver initiated auto insurance or not) then I will focus on prediction.\n#\n# For more information on XGBoost, click this link.\n#\n# # [XGBoost](https://xgboost.readthedocs.io/en/latest/)\n#\n\n\nimport xgboost as xgb\nfrom sklearn.model_selection import StratifiedKFold\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **2. Data Preparation**\n#\n# ## **2.1 Load Data**\n\n\ntrain = pd.read_csv('../input/train.csv', na_values=-1)\ntest = pd.read_csv('../input/test.csv', na_values=-1)\n\n\n# ## 2.2 Check for missing values(NaN)\n\n\ntrain.isnull().values.any()\n\n\n# ### Fill it with median value of the column\n#\n# this does not harm the distribution of the model\n\n\n\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# 2.3 Split features and targets from the data"}, {"original_comment": "# perform a Kronecker product\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Python Basics\n# **by [Jason DeBacker](http://jasondebacker.com), August 2017**\n#\n# This Jupyter Notebook is intended to introduce students to Python, with a particular focus on functionality that has direct application to economic modeling and econometrics. This Notebook will cover built-in object types some aspects of the Standard Library, as well and an introduction to NumPy and Pandas.\n#\n#\n# ## Built-in Types\n# There are several data types in the standard library. This include:\n# * Numeric Types\n# * int\n# * float\n# * complex\n# * Booleans\n# * Sequence Types\n# * Strings\n# * Lists\n# * Tuples\n# * Ranges\n# * Set Types\n# * Sets\n# * Mapping Types\n# * Dictionaries\n#\n# ### Numeric Types\n\n#%%\n\n# You do not need to declare the numeric type - python will infer it from the value of the object\n# you can always check the type of an object\nimport pandas as pd\nimport numpy as np\nthree = 3\ntype(three)\n\n\n# A few notes on the vocaulary of python here. The variables is `three`. Is the name for the object that is `3`. A name is a reference to an object. In this case, the name `three` references an object that is the number 3.\n#\n# A \"namespace\" is thus the collection of names (i.e., variables) that point to some objects. In particular, the namespace in Python is a dictionary that maps each variable name to is value (the object). More on dictionaries below.\n\n#%%\n\n# But you can cast an object as a certain type\nthree = float(3)\ntype(three)\n\n#%%\n\n# Even as a non-numeric type\nthree = str(3)\ntype(three)\n\n\n# Numeric types support most basic mathematical operations. They mostly have obvious names or use standard symbols. 
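# Aside, added to the tutorial: complex is listed among the numeric types at
# the top of this notebook but never shown, and floor division and modulo are
# two operators that often trip up newcomers, so here they are alongside the
# basics.

#%%

# the third built-in numeric type
z = 2 + 3j
print(type(z), z.real, z.imag)

# floor division and remainder
print(7 // 2)   # 3
print(7 % 2)    # 1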
For example:\n\n#%%\n\nx = 3\ny = 5\n# addition\nprint(x+y)\n\n# subtraction\nprint(x-y)\n\n# mulitiplication\nprint(x*y)\n\n# division\nprint(x/y)\n\n# absolute value\nabs(x-y)\n\n\n# An exception to this may be raising a number to a power.\n\n#%%\n\n# raising x to the y\nx ** y\n\n\n# ## Booleans\n#\n# Boolean objects are special types of integers. You can assign them value of `True` or `False`, which are equivalent to `1` and `0`, respectively.\n\n#%%\n\n# By setting the variable equal to True/False, Python will set it as the Boolean type\ndrop_out = True\ndrop_out\n\n#%%\n\n# Test if drop_out equals 1\ndrop_out == 1\n\n#%%\n\ndrop_out == 0\n\n\n# ## Sequence types\n#\n# ### Strings\n#\n# Strings are handled as a sequence of characters. Seems odd, but let's look at what this means.\n\n#%%\n\n# A string must be enclosed in single or double quotes\nname = 'Jason'\nname\n\n#%%\n\ntype(name)\n\n\n# #### Indexing\n#\n# Indexing refers to selecting an element (or \"slice\") or a sequence by referencing it's index. It's important to remmber that Python using 0-indexing. So the index of the first element in a sequence is 0.\n#\n# Are can select an element(s) from a sequence by referencing the index, including from strings.\n\n#%%\n\nprint('First letter = ', name[0])\n\n#%%\n\n# You can use a colon return a slice of a sequence based on index values (remember the 0-indexing!)\nprint('First 3 letters = ', name[0:3])\n\n\n# But notice that `0:3` doesn't pull the first four letters. This notation saying select from the 0th indexed element up to (but not including) the element with index 3.\n#\n# Futhermore, if you leave the left side of the colon emptpy, that means take the elements from the first up to (but not including) the index on the right side of the colon.\n\n#%%\n\nname[:3]\n\n#%%\n\n# And to get from the element with index 3 to the last, do\nname[3:]\n\n\n# And a colon on it's own takes all elements from the sequence (you wouldn't use this with a one-dimensional object like the string, but it's useful for slicing multi-dimensional arrays)\n\n#%%\n\nname[:]\n\n\n# There are many built-in functions that can be performed with strings. Some examples:\n\n#%%\n\nname + ' DeBacker'\n\n#%%\n\nname * 2\n\n#%%\n\nname.upper()\n\n#%%\n\nname.lower()\n\n\n# ### Lists\n#\n# Lists are ordered collections of objects. A list can be comprised of objects that are all the same type of different objects. But typically lists contain objects of similar types. One feature of lists is that they are \"mutable\", which means that you can change the object (in this case a list) after it's been created.\n\n#%%\n\n# To define a list, use square brackets\nnum_list = [1, 2, 3]\nnum_list\n\n#%%\n\n# Lists can repeat objects that have the same value\nnum_list2 = [1, 2, 3, 2]\nnum_list2\n\n#%%\n\n# You can reference elememts of a list and slice lists just as we did with strings\nnum_list2[3]\n\n#%%\n\nnum_list2[:3]\n\n#%%\n\n# You can also easily iterate over a list\nfor item in num_list:\n print(item)\n\n#%%\n\n# In fact you can do this with any sequence, including strings\nfor item in name:\n print(item)\n\n\n# There are a bunch of operations you can do with lists. 
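# Aside, added: two more list facts that fit here. Because lists are mutable,
# elements can be added in place with append and extend, and negative indices
# count from the end of any sequence.

#%%

nums = [1, 2, 3]
nums.append(4)        # add one element in place
nums.extend([5, 6])   # add several elements in place
print(nums)           # [1, 2, 3, 4, 5, 6]

# negative indices count from the end
print(nums[-1])       # 6
print(nums[-2:])      # [5, 6]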
A few examples:\n\n#%%\n\n# addition appends lists together\nnum_list + num_list2\n\n#%%\n\n# multiplication repeasts the list n times in one list\nnum_list * 2\n\n#%%\n\n# Reversing a list\nnum_list[::-1]\n\n#%%\n\n# Deleting an element by index\ndel(num_list[1])\nnum_list\n\n#%%\n\n# Deleting an element by value (deletes the first instance of that value in the list)\nnum_list2.remove(2)\nnum_list2\n\n#%%\n\n# showing how a list if mutable\n# we can change the 3rd element in num_list2 through the following assignement\nnum_list2[2] = 99\nnum_list2\n\n\n# ### Tuples\n#\n# Types are ordered collections of objects. They can contain objects of the same of differnt types and more often contain distinct types than do lists. Tuples major difference from lists are that they are \"immutable\" or cannot be changed after assignment.\n\n#%%\n\n# A tuple is defined by places a sequence of objects in parentheses\nnum_tuple = (1, 2, 3)\nnum_tuple\n\n#%%\n\n# Showing that a tuple is immutable\n# Try to change the 3rd element of the tuple\nnum_tuple[2] = 99\nnum_tuple\n\n#%%\n\n# But indexing and slicing works the same as we saw with other sequences\nnum_tuple[2]\n\n#%%\n\nnum_tuple[:1]\n\n#%%\n\n# and iterating over the sequence\nfor item in num_tuple:\n print(item)\n\n#%%\n\n# appending tuples works through addition\nnum_tuple + (4, 5)\n\n#%%\n\n# But deleting and removing elements don't - remember, immutable!\ndel(num_tuple[2])\n\n#%%\n\nnum_tuple.remove(2)\n\n\n# ### Ranges\n#\n# The last sequence type we'll discuss are ranges. Ranges are immutable sequences of numbers, typically used in `for` loops.\n\n#%%\n\nrange(3)\n\n#%%\n\n# notice how the range starts are 0 by default and range(n) contains n elements\nfor i in range(3):\n print(i)\n\n#%%\n\n# by range() has other arguments and we can start at not zero\n# range(x, y) will start at x and go up to (but not include y)\nfor i in range(10, 15):\n print(i)\n\n#%%\n\n# and range() can accept a third argument, that is the step size\n# so if we want to count down from 15 to 10 (but not including 10), we can do so with a step size = -1\nfor i in range(15, 10, -1):\n print(i)\n\n\n# ### Sets\n#\n# A set is a collection of *unordered* and *unique* objects. Because sets are unorders, they do not support indexing or slicing as the sequence tpes do. Sets are mutable (although there is another object type, a frozenset, that is like a set, but is immutable). These properties of sets make them very useful for testing membership or finding unique groups of values.\n#\n# Sets are defined with curly brackets.\n\n#%%\n\nnum_set = {1, 2, 3}\nnum_set\n\n#%%\n\n# Look what happens when values are repeated and numbers entered not in order\nnum_set2 = {5, 1, 2, 3, 2, 3}\nnum_set2\n\n#%%\n\n# since they are mutable, we can remove elements from sets\nnum_set2.remove(2)\nnum_set2\n\n#%%\n\n# because they contain only unique values, you can do some membership testing\n# e.g.\n5 in num_set2\n\n\n# ### Dictionaries\n#\n# Dictionaries contain key-value pairs. Keys provide the pointer to an associated value. Typically keys are strings, but they can be numeric (or even other types - but not dictionaries or lists or other mutable types). Values can be any type. 
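# Aside, added: beyond membership tests, sets support the usual set algebra
# (union, intersection and difference), which is often the reason to reach for
# them in the first place.

#%%

evens = {2, 4, 6, 8}
small = {1, 2, 3, 4}

print(evens | small)   # union: {1, 2, 3, 4, 6, 8}
print(evens & small)   # intersection: {2, 4}
print(evens - small)   # difference: {6, 8}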
Thus dictionaries are very valuable as providing mappings from a key word to a value.\n#\n# Dictionaries are created by placing a comma-separated list of key: value pairs within curly braces, for example:\n\n#%%\n\naddress_dict = {'street': 'Green St.',\n 'number': 1014, 'city': 'Columbia', 'state': 'SC'}\naddress_dict\n\n#%%\n\n# you reference a particular element of a dictionary by indexing with the key\naddress_dict['city']\n\n#%%\n\n# you can iterate over the keys or the keys and value of a dictionary\n# note the .items()\nfor key, value in address_dict.items():\n print('Key = ', key)\n print('Value = ', value)\n\n#%%\n\n# iterating only over the keys\n# note the .keys() method\nfor key in address_dict.keys():\n print(key)\n\n#%%\n\n# iterating only over the values\n# note the .values() method\nfor value in address_dict.values():\n print(value)\n\n#%%\n\n# just to show that you don't need to use the name key or value for the item in the list that's iterated over\nfor xyz in address_dict.values():\n print(xyz)\n\n\n# ## NumPy\n#\n# NumPy is a Python package that is important for economics applications and scientific computing in general. It allows you to define N-dimensional arrays, use sophisticated (broadcasting) functions, more easily integrate C/C++ and Fortran code, and provides useful linear algebra, Fourier transform, and random number capabilities. In this sense, you can think of it as bringing the standard Matlab functionality into Python.\n#\n# NumPy is a Python package, it's not part of the standard library or Python itself. Since you installed the Anaconda distribution of Python, you installed Python along with a number of packages, including Numpy. However, if we want to have access to the functionality of Numpy, we need to import NumPy into our Python session.\n#\n# To do this:\n\n#%%\n\n# note the \"as\" provide an alias for referencing numpy - so instead of typing numpy, we can just type np\n\n\n# Now NumPy is available in this iPython session running with this notebook!!\n#\n# Let's start using NumPy by creating a NumPy array. You can think of an array as a list of lists. E.g., a 2-D matrix can be thought of as a list of rows, where each row is a list of elements (the items in each column in that row). In fact, you could just make a list of lists with they built-in list type rather than using a NumPy array. But then matrix operations would involved a lot of (slow) loops. NumPy arrays will allow you to do matrix operations with NumPy methods that leverage optimally compiled code so that they run more quickly.\n#\n# But since an array is a list of lists and lists are defined using square brackets, we'll use similar syntax to define a Numpy array:\n\n#%%\n\nA = np.array([[1, 2], [3, 4]])\nA\n\n#%%\n\n# what is the shape of A?\nA.shape\n\n\n# We can do a whole bunch of mathemical operations on these arrays. By default, these are performed element by element.\n#\n# For example:\n\n#%%\n\nA + 2\n\n#%%\n\nA * 2\n\n#%%\n\nA / 2\n\n#%%\n\nA ** 2\n\n\n# If we want to do matrix operations, we need to call NumPy methods.\n\n#%%\n\n# perform a dot product\nnp.dot(A, A)\n\n#%%", "target_code": "np.kron(A, A)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Python Basics\n# **by [Jason DeBacker](http://jasondebacker.com), August 2017**\n#\n# This Jupyter Notebook is intended to introduce students to Python, with a particular focus on functionality that has direct application to economic modeling and econometrics. 
This Notebook will cover built-in object types some aspects of the Standard Library, as well and an introduction to NumPy and Pandas.\n#\n#\n# ## Built-in Types\n# There are several data types in the standard library. This include:\n# * Numeric Types\n# * int\n# * float\n# * complex\n# * Booleans\n# * Sequence Types\n# * Strings\n# * Lists\n# * Tuples\n# * Ranges\n# * Set Types\n# * Sets\n# * Mapping Types\n# * Dictionaries\n#\n# ### Numeric Types\n\n\n# You do not need to declare the numeric type - python will infer it from the value of the object\n# you can always check the type of an object\nimport pandas as pd\nimport numpy as np\nthree = 3\ntype(three)\n\n\n# A few notes on the vocaulary of python here. The variables is `three`. Is the name for the object that is `3`. A name is a reference to an object. In this case, the name `three` references an object that is the number 3.\n#\n# A \"namespace\" is thus the collection of names (i.e., variables) that point to some objects. In particular, the namespace in Python is a dictionary that maps each variable name to is value (the object). More on dictionaries below.\n\n\n# But you can cast an object as a certain type\nthree = float(3)\ntype(three)\n\n\n# Even as a non-numeric type\nthree = str(3)\ntype(three)\n\n\n# Numeric types support most basic mathematical operations. They mostly have obvious names or use standard symbols. For example:\n\n\nx = 3\ny = 5\n# addition\nprint(x+y)\n\n# subtraction\nprint(x-y)\n\n# mulitiplication\nprint(x*y)\n\n# division\nprint(x/y)\n\n# absolute value\nabs(x-y)\n\n\n# An exception to this may be raising a number to a power.\n\n\n# raising x to the y\nx ** y\n\n\n# ## Booleans\n#\n# Boolean objects are special types of integers. You can assign them value of `True` or `False`, which are equivalent to `1` and `0`, respectively.\n\n\n# By setting the variable equal to True/False, Python will set it as the Boolean type\ndrop_out = True\ndrop_out\n\n\n# Test if drop_out equals 1\ndrop_out == 1\n\n\ndrop_out == 0\n\n\n# ## Sequence types\n#\n# ### Strings\n#\n# Strings are handled as a sequence of characters. Seems odd, but let's look at what this means.\n\n\n# A string must be enclosed in single or double quotes\nname = 'Jason'\nname\n\n\ntype(name)\n\n\n# #### Indexing\n#\n# Indexing refers to selecting an element (or \"slice\") or a sequence by referencing it's index. It's important to remmber that Python using 0-indexing. So the index of the first element in a sequence is 0.\n#\n# Are can select an element(s) from a sequence by referencing the index, including from strings.\n\n\nprint('First letter = ', name[0])\n\n\n# You can use a colon return a slice of a sequence based on index values (remember the 0-indexing!)\nprint('First 3 letters = ', name[0:3])\n\n\n# But notice that `0:3` doesn't pull the first four letters. This notation saying select from the 0th indexed element up to (but not including) the element with index 3.\n#\n# Futhermore, if you leave the left side of the colon emptpy, that means take the elements from the first up to (but not including) the index on the right side of the colon.\n\n\nname[:3]\n\n\n# And to get from the element with index 3 to the last, do\nname[3:]\n\n\n# And a colon on it's own takes all elements from the sequence (you wouldn't use this with a one-dimensional object like the string, but it's useful for slicing multi-dimensional arrays)\n\n\nname[:]\n\n\n# There are many built-in functions that can be performed with strings. 
Some examples:\n\n\nname + ' DeBacker'\n\n\nname * 2\n\n\nname.upper()\n\n\nname.lower()\n\n\n# ### Lists\n#\n# Lists are ordered collections of objects. A list can be comprised of objects that are all the same type of different objects. But typically lists contain objects of similar types. One feature of lists is that they are \"mutable\", which means that you can change the object (in this case a list) after it's been created.\n\n\n# To define a list, use square brackets\nnum_list = [1, 2, 3]\nnum_list\n\n\n# Lists can repeat objects that have the same value\nnum_list2 = [1, 2, 3, 2]\nnum_list2\n\n\n# You can reference elememts of a list and slice lists just as we did with strings\nnum_list2[3]\n\n\nnum_list2[:3]\n\n\n# You can also easily iterate over a list\nfor item in num_list:\n print(item)\n\n\n# In fact you can do this with any sequence, including strings\nfor item in name:\n print(item)\n\n\n# There are a bunch of operations you can do with lists. A few examples:\n\n\n# addition appends lists together\nnum_list + num_list2\n\n\n# multiplication repeasts the list n times in one list\nnum_list * 2\n\n\n# Reversing a list\nnum_list[::-1]\n\n\n# Deleting an element by index\ndel(num_list[1])\nnum_list\n\n\n# Deleting an element by value (deletes the first instance of that value in the list)\nnum_list2.remove(2)\nnum_list2\n\n\n# showing how a list if mutable\n# we can change the 3rd element in num_list2 through the following assignement\nnum_list2[2] = 99\nnum_list2\n\n\n# ### Tuples\n#\n# Types are ordered collections of objects. They can contain objects of the same of differnt types and more often contain distinct types than do lists. Tuples major difference from lists are that they are \"immutable\" or cannot be changed after assignment.\n\n\n# A tuple is defined by places a sequence of objects in parentheses\nnum_tuple = (1, 2, 3)\nnum_tuple\n\n\n# Showing that a tuple is immutable\n# Try to change the 3rd element of the tuple\nnum_tuple[2] = 99\nnum_tuple\n\n\n# But indexing and slicing works the same as we saw with other sequences\nnum_tuple[2]\n\n\nnum_tuple[:1]\n\n\n# and iterating over the sequence\nfor item in num_tuple:\n print(item)\n\n\n# appending tuples works through addition\nnum_tuple + (4, 5)\n\n\n# But deleting and removing elements don't - remember, immutable!\ndel(num_tuple[2])\n\n\nnum_tuple.remove(2)\n\n\n# ### Ranges\n#\n# The last sequence type we'll discuss are ranges. Ranges are immutable sequences of numbers, typically used in `for` loops.\n\n\nrange(3)\n\n\n# notice how the range starts are 0 by default and range(n) contains n elements\nfor i in range(3):\n print(i)\n\n\n# by range() has other arguments and we can start at not zero\n# range(x, y) will start at x and go up to (but not include y)\nfor i in range(10, 15):\n print(i)\n\n\n# and range() can accept a third argument, that is the step size\n# so if we want to count down from 15 to 10 (but not including 10), we can do so with a step size = -1\nfor i in range(15, 10, -1):\n print(i)\n\n\n# ### Sets\n#\n# A set is a collection of *unordered* and *unique* objects. Because sets are unorders, they do not support indexing or slicing as the sequence tpes do. Sets are mutable (although there is another object type, a frozenset, that is like a set, but is immutable). 
These properties of sets make them very useful for testing membership or finding unique groups of values.\n#\n# Sets are defined with curly brackets.\n\n\nnum_set = {1, 2, 3}\nnum_set\n\n\n# Look what happens when values are repeated and numbers entered not in order\nnum_set2 = {5, 1, 2, 3, 2, 3}\nnum_set2\n\n\n# since they are mutable, we can remove elements from sets\nnum_set2.remove(2)\nnum_set2\n\n\n# because they contain only unique values, you can do some membership testing\n# e.g.\n5 in num_set2\n\n\n# ### Dictionaries\n#\n# Dictionaries contain key-value pairs. Keys provide the pointer to an associated value. Typically keys are strings, but they can be numeric (or even other types - but not dictionaries or lists or other mutable types). Values can be any type. Thus dictionaries are very valuable as providing mappings from a key word to a value.\n#\n# Dictionaries are created by placing a comma-separated list of key: value pairs within curly braces, for example:\n\n\naddress_dict = {'street': 'Green St.',\n 'number': 1014, 'city': 'Columbia', 'state': 'SC'}\naddress_dict\n\n\n# you reference a particular element of a dictionary by indexing with the key\naddress_dict['city']\n\n\n# you can iterate over the keys or the keys and value of a dictionary\n# note the .items()\nfor key, value in address_dict.items():\n print('Key = ', key)\n print('Value = ', value)\n\n\n# iterating only over the keys\n# note the .keys() method\nfor key in address_dict.keys():\n print(key)\n\n\n# iterating only over the values\n# note the .values() method\nfor value in address_dict.values():\n print(value)\n\n\n# just to show that you don't need to use the name key or value for the item in the list that's iterated over\nfor xyz in address_dict.values():\n print(xyz)\n\n\n# ## NumPy\n#\n# NumPy is a Python package that is important for economics applications and scientific computing in general. It allows you to define N-dimensional arrays, use sophisticated (broadcasting) functions, more easily integrate C/C++ and Fortran code, and provides useful linear algebra, Fourier transform, and random number capabilities. In this sense, you can think of it as bringing the standard Matlab functionality into Python.\n#\n# NumPy is a Python package, it's not part of the standard library or Python itself. Since you installed the Anaconda distribution of Python, you installed Python along with a number of packages, including Numpy. However, if we want to have access to the functionality of Numpy, we need to import NumPy into our Python session.\n#\n# To do this:\n\n\n# note the \"as\" provide an alias for referencing numpy - so instead of typing numpy, we can just type np\n\n\n# Now NumPy is available in this iPython session running with this notebook!!\n#\n# Let's start using NumPy by creating a NumPy array. You can think of an array as a list of lists. E.g., a 2-D matrix can be thought of as a list of rows, where each row is a list of elements (the items in each column in that row). In fact, you could just make a list of lists with they built-in list type rather than using a NumPy array. But then matrix operations would involved a lot of (slow) loops. 
NumPy arrays will allow you to do matrix operations with NumPy methods that leverage optimally compiled code so that they run more quickly.\n#\n# But since an array is a list of lists and lists are defined using square brackets, we'll use similar syntax to define a Numpy array:\n\n\nA = np.array([[1, 2], [3, 4]])\nA\n\n\n# what is the shape of A?\nA.shape\n\n\n# We can do a whole bunch of mathemical operations on these arrays. By default, these are performed element by element.\n#\n# For example:\n\n\nA + 2\n\n\nA * 2\n\n\nA / 2\n\n\nA ** 2\n\n\n# If we want to do matrix operations, we need to call NumPy methods.\n\n\n# perform a dot product\nnp.dot(A, A)\n\n", "project_metadata": {"full_name": "jdebacker/CompEcon_Fall17", "description": "ECON 815: Computational Methods for Economists", "topics": [], "git_url": "git://github.com/jdebacker/CompEcon_Fall17.git", "stars": 39, "watchers": 39, "forks": 40, "created": "2017-08-02T18:15:37Z", "size": 42405, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3999743, "Python": 7995}, "last_updated": "2020-10-28T12:32:14Z"}, "intent": "# perform a Kronecker product"}, {"original_comment": "# Getting the transposed array\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Before we start...\n# http://www.numpy.org/\nimport numpy\n\n#%%\n\n# Generating numbers\narange = numpy.arange(15)\n\nprint(arange)\n\n#%%\n\n# Creating an array from a range\narray = arange.reshape(3, 5)\nprint(array)", "target_code": "print(numpy.transpose(array))\nprint(array.T)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Before we start...\n# http://www.numpy.org/\nimport numpy\n\n\n# Generating numbers\narange = numpy.arange(15)\n\nprint(arange)\n\n\n# Creating an array from a range\narray = arange.reshape(3, 5)\nprint(array)\n", "project_metadata": {"full_name": "biancarosa/pybr-neural-nets", "description": ":beers: jupyter notebooks to support my python brasil presentation", "topics": [], "git_url": "git://github.com/biancarosa/pybr-neural-nets.git", "stars": 6, "watchers": 6, "forks": 2, "created": "2017-09-20T12:40:20Z", "size": 19829, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 69052}, "last_updated": "2019-10-31T19:01:48Z"}, "intent": "# Getting the transposed array"}, {"original_comment": "# #### 26. Exponential function based on natural logarithm function\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 100 NumPy Exercises\n\n# ## Introduction\n\n# ![numpylogo.png](attachment:numpylogo.png)\n#\n# **NumPy** is an extension library for Python language, supporting operations of a large number of high-dimensional arrays and matrices. In addition, it also provides a large number of mathematical function libraries for array operations. Machine learning involves a lot of transformations and operations on arrays, which makes NumPy one of the essential tools.\n#\n# **100 NumPy Exercises** is divided into _basic part_ and _advanced part_, each with 50 exercises. The basic part of the exercise helps you familiarize yourself with the use of NumPy's common methods and the advanced part focuses on the combined application of the NumPy methods.\n#\n# If you already have a foundation for NumPy before you take the course, you can review it in the following cells. 
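# Aside, added: reshape appears several times in these exercises; one
# convenience worth knowing is that a single dimension may be given as -1 and
# NumPy will infer it from the array size.

#%%

import numpy as np

a = np.arange(15)
print(a.reshape(3, -1).shape)   # (3, 5), the -1 is inferred
print(a.reshape(-1, 5).shape)   # (3, 5)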
If you are unfamiliar with NumPy, make sure you **manually** practice in the blank cells below each example cell.\n\n# ### Knowledge Points\n#\n# The main points covered in this experiment are:\n# - Creating an array\n# - Array operation\n# - Mathematical functions\n# - Array slices and indexes\n# - Array shape operations\n# - Array sorting\n# - Array statistics\n\n# ### Environment\n#\n# - Python 3.8\n# - NumPy 1.18.1\n\n# ### Catalog Index\n#\n# - 1. Basic Part\n# - 2. Advanced Part\n# - 3. Summary\n\n# ---\n\n# ## 1. Basic Part\n\n# ### Import NumPy\n\n# #### 1. Import NumPy\n# Before practicing **NumPy**, first you need to import the `numpy` module, and the convention is abbreviated as `np`:\n\n#%%\n\nimport requests\nimport PIL\nfrom matplotlib import pyplot as plt\nfrom PIL import Image\nfrom io import BytesIO\nimport numpy as np\n\n#%%\n\n# Repeat the code practice in the blank cell, do not just copy and paste!\nimport numpy as np\n\n\n# #### 2. View NumPy version\n\n#%%\n\nprint(np.__version__)\n\n#%%\n\nprint(np.__version__)\n\n\n# ### Create an Array\n\n# The main object of **NumPy** is the multidimensional array `ndarray`. In NumPy, `dimensions` are called `axes`, and the number of axes is called `rank`.\n#\n# For example, the array below is an array of rank 1, because it has only one axis and the length of the axis is 3:\n# ```\n# [1, 2, 3]\n# ```\n#\n# For another example, the rank of this array is 2. The first dimension length is 2 and the second dimension length is 3:\n#\n# ```\n# [[1., 2., 3.],\n# \u00a0[4., 5., 6.]]\n# ```\n\n# #### 3. Create a one-dimensional array from a list\n#\n# **Note:** `numpy.array` is not the same as the Python standard library `array.array`. The former is more powerful, which is one of the important reasons why we learn `Numpy`:\n\n#%%\n\nnp.array([1, 2, 3])\n\n#%%\n\nnp.array([1.2, 4.6, 10, 3])\n\n\n# #### 4. Create a two-dimensional array from a list\n\n#%%\n\nnp.array([(1, 2, 3), (4, 5, 6)])\n\n#%%\n\nA = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12],\n [13, 14, 15, 16, 17, 18]])\n\n#%%\n\nprint(A)\n\n\n# #### 5. Create a two-dimensional array full of `0`\n\n#%%\n\nnp.zeros((3, 3))\n\n#%%\n\nprint(np.zeros((2, 2, 2)))\n\n\n# #### 6. Create a three-dimensional array full of `1`\n\n#%%\n\nnp.ones((2, 3, 4))\n\n#%%\n\nnp.ones((2, 3, 4, 5))\n\n\n# ```javascript\n# Note: Please think about the dimensional relationships of the above 4 arrays.\n# ```\n\n# #### 7. Create a one-dimensional arithmetic progression array\n\n#%%\n\nnp.arange(5)\n\n#%%\n\nprint(np.arange(3, 10))\n\n\n# #### 8. Create a two-Dimensional arithmetic progression array\n\n#%%\n\nnp.arange(6).reshape(2, 3)\n\n#%%\n\nprint(np.arange(15).reshape(3, 5))\n\n\n# #### 9. Create a unit matrix (two-dimensional array)\n\n#%%\n\nnp.eye(3)\n\n#%%\n\nnp.eye(5)\n\n\n# #### 10. Create equally spaced one-dimensional array\n\n#%%\n\nnp.linspace(1, 10, num=6)\n\n#%%\n\nnp.linspace(3, 20, num=7)\n\n\n# #### 11. Create two-dimensional random array\n\n#%%\n\nnp.random.rand(2, 3)\n\n#%%\n\nnp.random.rand(3, 5)\n\n\n# #### 12. Create two-dimensional random integer array (value < 5)\n\n#%%\n\nnp.random.randint(5, size=(2, 3))\n\n#%%\n\nnp.random.randint(10, size=(3, 5))\n\n\n# #### 13. 
Create an array based on a custom function\n\n#%%\n\nnp.fromfunction(lambda i, j: i + j, (3, 3))\n\n#%%\n\nnp.fromfunction(lambda i, j: 7*i+j, (2, 3))\n\n\n# ### Array Operations\n\n# #### Generate a one-dimensional example array\n\n#%%\n\na = np.array([10, 20, 30, 40, 50])\nb = np.arange(1, 6)\na, b\n\n#%%\n\na2 = np.array([10, 20, 30, 40, 50, 60, 70])\nb2 = np.arange(1, 8)\na2, b2\n\n\n# #### 14. One-dimensional array addition\n#\n\n#%%\n\na + b\n\n#%%\n\na2+b2\n\n\n# #### 15. One-dimensional array subtraction\n\n#%%\n\na - b\n\n#%%\n\na2-b2\n\n\n# #### 16. One-dimensional array multiplication\n\n#%%\n\na * b\n\n#%%\n\na2*b2\n\n\n# #### 17. One-dimensional array division\n\n#%%\n\na / b\n\n#%%\n\na2/b2\n\n\n# #### Generate a two-dimensional example array (can be seen as a matrix)\n\n#%%\n\nA = np.array([[1, 2],\n [3, 4]])\nB = np.array([[5, 6],\n [7, 8]])\nA, B\n\n#%%\n\nA2 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\nB2 = np.arange(4, 13).reshape(3, 3)\nprint(A2, '\\n', B2)\n\n\n# #### 18. Matrix addition\n\n#%%\n\nA + B\n\n#%%\n\nA2+B2\n\n\n# #### 19. Matrix subtraction\n\n#%%\n\nA - B\n\n#%%\n\nA2-B2\n\n\n# #### 20. Multiplication between matrix elements\n\n#%%\n\nA * B\n\n#%%\n\nA2*B2\n\n\n# #### 21. Matrix multiplication (Note the difference between 21 and 20)\n\n#%%\n\nnp.dot(A, B)\n\n#%%\n\nnp.dot(A2, B2)\n\n#%%\n\n# If you use np.mat to accurately define a two-dimensional array as a matrix, you can use *\n# to complete the matrix multiplication.\nnp.mat(A) * np.mat(B)\n\n#%%\n\nnp.mat(A2)*np.mat(B2)\n\n\n# #### 22. Multiplication between matrix and number\n\n#%%\n\n2 * A\n\n#%%\n\n2.5*A2\n\n\n# #### 23. Transpose of matrix\n\n#%%\n\nA.T\n\n#%%\n\nA2.T\n\n\n# #### 24. Matrix inversion\n\n#%%\n\nnp.linalg.inv(A)\n\n#%%\n\nnp.linalg.inv(A2)\n\n\n# ### Mathematical Functions\n\n# #### 25. Trigonometric functions\n\n#%%\n\nprint(a)\n\nnp.sin(a)\n\n#%%\n\nprint(a2)\nnp.cos(a2)\n\n#%%\n\nnp.sin((1/2)*np.pi)", "target_code": "np.exp(a)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 100 NumPy Exercises\n\n# ## Introduction\n\n# ![numpylogo.png](attachment:numpylogo.png)\n#\n# **NumPy** is an extension library for Python language, supporting operations of a large number of high-dimensional arrays and matrices. In addition, it also provides a large number of mathematical function libraries for array operations. Machine learning involves a lot of transformations and operations on arrays, which makes NumPy one of the essential tools.\n#\n# **100 NumPy Exercises** is divided into _basic part_ and _advanced part_, each with 50 exercises. The basic part of the exercise helps you familiarize yourself with the use of NumPy's common methods and the advanced part focuses on the combined application of the NumPy methods.\n#\n# If you already have a foundation for NumPy before you take the course, you can review it in the following cells. If you are unfamiliar with NumPy, make sure you **manually** practice in the blank cells below each example cell.\n\n# ### Knowledge Points\n#\n# The main points covered in this experiment are:\n# - Creating an array\n# - Array operation\n# - Mathematical functions\n# - Array slices and indexes\n# - Array shape operations\n# - Array sorting\n# - Array statistics\n\n# ### Environment\n#\n# - Python 3.8\n# - NumPy 1.18.1\n\n# ### Catalog Index\n#\n# - 1. Basic Part\n# - 2. Advanced Part\n# - 3. Summary\n\n# ---\n\n# ## 1. Basic Part\n\n# ### Import NumPy\n\n# #### 1. 
Import NumPy\n# Before practicing **NumPy**, first you need to import the `numpy` module, and the convention is abbreviated as `np`:\n\n\nimport requests\nimport PIL\nfrom matplotlib import pyplot as plt\nfrom PIL import Image\nfrom io import BytesIO\nimport numpy as np\n\n\n# Repeat the code practice in the blank cell, do not just copy and paste!\nimport numpy as np\n\n\n# #### 2. View NumPy version\n\n\nprint(np.__version__)\n\n\nprint(np.__version__)\n\n\n# ### Create an Array\n\n# The main object of **NumPy** is the multidimensional array `ndarray`. In NumPy, `dimensions` are called `axes`, and the number of axes is called `rank`.\n#\n# For example, the array below is an array of rank 1, because it has only one axis and the length of the axis is 3:\n# ```\n# [1, 2, 3]\n# ```\n#\n# For another example, the rank of this array is 2. The first dimension length is 2 and the second dimension length is 3:\n#\n# ```\n# [[1., 2., 3.],\n# \u00a0[4., 5., 6.]]\n# ```\n\n# #### 3. Create a one-dimensional array from a list\n#\n# **Note:** `numpy.array` is not the same as the Python standard library `array.array`. The former is more powerful, which is one of the important reasons why we learn `Numpy`:\n\n\nnp.array([1, 2, 3])\n\n\nnp.array([1.2, 4.6, 10, 3])\n\n\n# #### 4. Create a two-dimensional array from a list\n\n\nnp.array([(1, 2, 3), (4, 5, 6)])\n\n\nA = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12],\n [13, 14, 15, 16, 17, 18]])\n\n\nprint(A)\n\n\n# #### 5. Create a two-dimensional array full of `0`\n\n\nnp.zeros((3, 3))\n\n\nprint(np.zeros((2, 2, 2)))\n\n\n# #### 6. Create a three-dimensional array full of `1`\n\n\nnp.ones((2, 3, 4))\n\n\nnp.ones((2, 3, 4, 5))\n\n\n# ```javascript\n# Note: Please think about the dimensional relationships of the above 4 arrays.\n# ```\n\n# #### 7. Create a one-dimensional arithmetic progression array\n\n\nnp.arange(5)\n\n\nprint(np.arange(3, 10))\n\n\n# #### 8. Create a two-Dimensional arithmetic progression array\n\n\nnp.arange(6).reshape(2, 3)\n\n\nprint(np.arange(15).reshape(3, 5))\n\n\n# #### 9. Create a unit matrix (two-dimensional array)\n\n\nnp.eye(3)\n\n\nnp.eye(5)\n\n\n# #### 10. Create equally spaced one-dimensional array\n\n\nnp.linspace(1, 10, num=6)\n\n\nnp.linspace(3, 20, num=7)\n\n\n# #### 11. Create two-dimensional random array\n\n\nnp.random.rand(2, 3)\n\n\nnp.random.rand(3, 5)\n\n\n# #### 12. Create two-dimensional random integer array (value < 5)\n\n\nnp.random.randint(5, size=(2, 3))\n\n\nnp.random.randint(10, size=(3, 5))\n\n\n# #### 13. Create an array based on a custom function\n\n\nnp.fromfunction(lambda i, j: i + j, (3, 3))\n\n\nnp.fromfunction(lambda i, j: 7*i+j, (2, 3))\n\n\n# ### Array Operations\n\n# #### Generate a one-dimensional example array\n\n\na = np.array([10, 20, 30, 40, 50])\nb = np.arange(1, 6)\na, b\n\n\na2 = np.array([10, 20, 30, 40, 50, 60, 70])\nb2 = np.arange(1, 8)\na2, b2\n\n\n# #### 14. One-dimensional array addition\n#\n\n\na + b\n\n\na2+b2\n\n\n# #### 15. One-dimensional array subtraction\n\n\na - b\n\n\na2-b2\n\n\n# #### 16. One-dimensional array multiplication\n\n\na * b\n\n\na2*b2\n\n\n# #### 17. One-dimensional array division\n\n\na / b\n\n\na2/b2\n\n\n# #### Generate a two-dimensional example array (can be seen as a matrix)\n\n\nA = np.array([[1, 2],\n [3, 4]])\nB = np.array([[5, 6],\n [7, 8]])\nA, B\n\n\nA2 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\nB2 = np.arange(4, 13).reshape(3, 3)\nprint(A2, '\\n', B2)\n\n\n# #### 18. Matrix addition\n\n\nA + B\n\n\nA2+B2\n\n\n# #### 19. 
Matrix subtraction\n\n\nA - B\n\n\nA2-B2\n\n\n# #### 20. Multiplication between matrix elements\n\n\nA * B\n\n\nA2*B2\n\n\n# #### 21. Matrix multiplication (Note the difference between 21 and 20)\n\n\nnp.dot(A, B)\n\n\nnp.dot(A2, B2)\n\n\n# If you use np.mat to accurately define a two-dimensional array as a matrix, you can use *\n# to complete the matrix multiplication.\nnp.mat(A) * np.mat(B)\n\n\nnp.mat(A2)*np.mat(B2)\n\n\n# #### 22. Multiplication between matrix and number\n\n\n2 * A\n\n\n2.5*A2\n\n\n# #### 23. Transpose of matrix\n\n\nA.T\n\n\nA2.T\n\n\n# #### 24. Matrix inversion\n\n\nnp.linalg.inv(A)\n\n\nnp.linalg.inv(A2)\n\n\n# ### Mathematical Functions\n\n# #### 25. Trigonometric functions\n\n\nprint(a)\n\nnp.sin(a)\n\n\nprint(a2)\nnp.cos(a2)\n\n\nnp.sin((1/2)*np.pi)\n\n\n\n", "project_metadata": {"full_name": "zahta/path2ml", "description": "I am studying Data Science. In this repository, I have written about my experiences in studying Machine learning. Also, I have included the solutions of some theoretical and practical Machine learning exercises.", "topics": ["machine-learning", "python", "data-science", "anaconda", "study", "assignment", "exercise", "git", "colab", "conda", "jupyter-lab", "jupyter-notebook", "resources"], "git_url": "git://github.com/zahta/path2ml.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-02-09T15:38:08Z", "size": 18814, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10354491, "Python": 4073}, "last_updated": "2020-12-12T12:50:43Z"}, "intent": "# 26. Exponential function based on natural logarithm function"}, {"original_comment": "# ### Fully-Connected Layer\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.utils import shuffle\nimport cv2\nimport glob\nimport random\nimport math\nfrom datetime import timedelta\nimport time\nfrom sklearn.metrics import confusion_matrix\nimport numpy as np\nimport pandas as pd\nimport tensorflow as tf\nimport matplotlib.pyplot as plt\nimport os\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Function of loading dataset\n\n#%%\n\ndef load_train(train_path, image_size, classes):\n images = []\n labels = []\n ids = []\n cls = []\n\n print('Reading training images')\n for fld in classes: # assuming data directory has a separate folder for each class, and that each folder is named after the class\n index = classes.index(fld)\n print('Loading {} files (Index: {})'.format(fld, index))\n path = os.path.join(train_path, fld, '*g')\n files = glob.glob(path)\n for fl in files:\n image = cv2.imread(fl)\n image = cv2.resize(image, (image_size, image_size),\n interpolation=cv2.INTER_LINEAR)\n images.append(image)\n label = np.zeros(len(classes))\n label[index] = 1.0\n labels.append(label)\n flbase = os.path.basename(fl)\n ids.append(flbase)\n cls.append(fld)\n images = np.array(images)\n labels = np.array(labels)\n ids = np.array(ids)\n cls = np.array(cls)\n\n return images, labels, ids, cls\n\n\ndef load_test(test_path, image_size):\n path = os.path.join(test_path, '*g')\n files = sorted(glob.glob(path))\n\n X_test = []\n X_test_id = []\n print(\"Reading test images\")\n for fl in files:\n flbase = os.path.basename(fl)\n img = cv2.imread(fl)\n img = cv2.resize(img, (image_size, image_size),\n interpolation=cv2.INTER_LINEAR)\n X_test.append(img)\n X_test_id.append(flbase)\n\n# because we're not creating a DataSet object for the test images, normalization happens here\n X_test = np.array(X_test, dtype=np.uint8)\n X_test = 
X_test.astype('float32')\n X_test = X_test / 255\n\n return X_test, X_test_id\n\n\nclass DataSet(object):\n\n def __init__(self, images, labels, ids, cls):\n \"\"\"Construct a DataSet. one_hot arg is used only if fake_data is true.\"\"\"\n\n self._num_examples = images.shape[0]\n\n # Convert shape from [num examples, rows, columns, depth]\n # to [num examples, rows*columns] (assuming depth == 1)\n # Convert from [0, 255] -> [0.0, 1.0].\n\n images = images.astype(np.float32)\n images = np.multiply(images, 1.0 / 255.0)\n\n self._images = images\n self._labels = labels\n self._ids = ids\n self._cls = cls\n self._epochs_completed = 0\n self._index_in_epoch = 0\n\n @property\n def images(self):\n return self._images\n\n @property\n def labels(self):\n return self._labels\n\n @property\n def ids(self):\n return self._ids\n\n @property\n def cls(self):\n return self._cls\n\n @property\n def num_examples(self):\n return self._num_examples\n\n @property\n def epochs_completed(self):\n return self._epochs_completed\n\n def next_batch(self, batch_size):\n \"\"\"Return the next `batch_size` examples from this data set.\"\"\"\n start = self._index_in_epoch\n self._index_in_epoch += batch_size\n\n if self._index_in_epoch > self._num_examples:\n # Finished epoch\n self._epochs_completed += 1\n\n # # Shuffle the data (maybe)\n # perm = np.arange(self._num_examples)\n # np.random.shuffle(perm)\n # self._images = self._images[perm]\n # self._labels = self._labels[perm]\n # Start next epoch\n\n start = 0\n self._index_in_epoch = batch_size\n assert batch_size <= self._num_examples\n end = self._index_in_epoch\n\n return self._images[start:end], self._labels[start:end], self._ids[start:end], self._cls[start:end]\n\n\ndef read_train_sets(train_path, image_size, classes, validation_size=0):\n class DataSets(object):\n pass\n data_sets = DataSets()\n\n images, labels, ids, cls = load_train(train_path, image_size, classes)\n images, labels, ids, cls = shuffle(\n images, labels, ids, cls) # shuffle the data\n\n if isinstance(validation_size, float):\n validation_size = int(validation_size * images.shape[0])\n\n validation_images = images[:validation_size]\n validation_labels = labels[:validation_size]\n validation_ids = ids[:validation_size]\n validation_cls = cls[:validation_size]\n\n train_images = images[validation_size:]\n train_labels = labels[validation_size:]\n train_ids = ids[validation_size:]\n train_cls = cls[validation_size:]\n\n data_sets.train = DataSet(\n train_images, train_labels, train_ids, train_cls)\n data_sets.valid = DataSet(\n validation_images, validation_labels, validation_ids, validation_cls)\n\n return data_sets\n\n\ndef read_test_set(test_path, image_size):\n images, ids = load_test(test_path, image_size)\n return images, ids\n\n\n# ## Configuration and Hyperparameters\n\n#%%\n\n# Convolutional Layer 1.\nfilter_size1 = 5\nnum_filters1 = 64\n\n# Convolutional Layer 2.\nfilter_size2 = 3\nnum_filters2 = 64\n\n# # Convolutional Layer 3.\n# filter_size3 = 5\n# num_filters3 = 128\n\n# Fully-connected layer 1.\nfc1_size = 128 # Number of neurons in fully-connected layer.\n\n# Fully-connected layer 2.\nfc2_size = 128 # Number of neurons in fully-connected layer.\n\n# Number of color channels for the images: 1 channel for gray-scale.\nnum_channels = 3\n\n# image dimensions (only squares for now)\nimg_size = 64\n\n# Size of image when flattened to a single dimension\nimg_size_flat = img_size * img_size * num_channels\n\n# Tuple with height and width of images used to reshape arrays.\nimg_shape = 
(img_size, img_size)\n\n# class info\nclasses = ['Sphynx', 'Siamese', 'Ragdoll',\n 'Persian', 'Maine_Coon', 'British_shorthair', 'Bombay', 'Birman', 'Bengal', 'Abyssinian']\n\n# classes = ['Sphynx','Siamese',\n# 'Persian','Maine_Coon','British_shorthair']\n\nnum_classes = len(classes)\n\n# batch size\nbatch_size = 32\n\n# validation split\nvalidation_size = .2\n\n# how long to wait after validation loss stops improving before terminating training\nearly_stopping = None # use None if you don't want to implement early stoping\n\ntrain_path = 'dataset'\n# test_path = 'test'\ncheckpoint_dir = \"ckpoint\"\n\n#%%\n\n\n\n#%%\n\n# load training dataset\ndata = read_train_sets(train_path, img_size, classes,\n validation_size=validation_size)\n# test_images, test_ids = read_test_set(test_path, img_size)\n\n#%%\n\nprint(\"Size of:\")\nprint(\"- Training-set:\\t\\t{}\".format(len(data.train.labels)))\n# print(\"- Test-set:\\t\\t{}\".format(len(test_images)))\nprint(\"- Validation:\\t{}\".format(len(data.valid.labels)))\n# print(images)\n\n\n# ### Helper-function for plotting images\n\n#%%\n\ndef plot_images(images, cls_true, cls_pred=None):\n\n if len(images) == 0:\n print(\"no images to show\")\n return\n else:\n random_indices = random.sample(range(len(images)), min(len(images), 9))\n\n images, cls_true = zip(*[(images[i], cls_true[i]) for i in random_indices])\n\n # Create figure with 3x3 sub-plots.\n fig, axes = plt.subplots(3, 3)\n fig.subplots_adjust(hspace=0.3, wspace=0.3)\n\n for i, ax in enumerate(axes.flat):\n # Plot image.\n ax.imshow(images[i].reshape(img_size, img_size, num_channels))\n\n # Show true and predicted classes.\n if cls_pred is None:\n xlabel = \"True: {0}\".format(cls_true[i])\n else:\n xlabel = \"True: {0}, Pred: {1}\".format(cls_true[i], cls_pred[i])\n\n # Show the classes as the label on the x-axis.\n ax.set_xlabel(xlabel)\n\n # Remove ticks from the plot.\n ax.set_xticks([])\n ax.set_yticks([])\n\n # Ensure the plot is shown correctly with multiple plots\n # in a single Notebook cell.\n plt.show()\n\n#%%\n\n# Get some random images and their labels from the train set.\n\nimages, cls_true = data.train.images, data.train.cls\n\n# Plot the images and labels using our helper-function above.\nplot_images(images=images, cls_true=cls_true)\n\n\n# ## TensorFlow Graph\n\n# ### Helper-functions for creating new variables\n\n#%%\n\ndef new_weights(shape):\n return tf.Variable(tf.truncated_normal(shape, stddev=0.05))\n\n\ndef new_biases(length):\n return tf.Variable(tf.constant(0.05, shape=[length]))\n\n\n# ### Convolutional Layer\n\n#%%\n\ndef new_conv_layer(input, # The previous layer.\n num_input_channels, # Num. channels in prev. layer.\n filter_size, # Width and height of each filter.\n num_filters, # Number of filters.\n use_pooling=True): # Use 2x2 max-pooling.\n\n # Shape of the filter-weights for the convolution.\n # This format is determined by the TensorFlow API.\n shape = [filter_size, filter_size, num_input_channels, num_filters]\n\n # Create new weights aka. filters with the given shape.\n weights = new_weights(shape=shape)\n\n # Create new biases, one for each filter.\n biases = new_biases(length=num_filters)\n\n # Create the TensorFlow operation for convolution.\n # Note the strides are set to 1 in all dimensions.\n # The first and last stride must always be 1,\n # because the first is for the image-number and\n # the last is for the input-channel.\n # But e.g. 
strides=[1, 2, 2, 1] would mean that the filter\n # is moved 2 pixels across the x- and y-axis of the image.\n # The padding is set to 'SAME' which means the input image\n # is padded with zeroes so the size of the output is the same.\n layer = tf.nn.conv2d(input=input,\n filter=weights,\n strides=[1, 1, 1, 1],\n padding='SAME')\n\n # Add the biases to the results of the convolution.\n # A bias-value is added to each filter-channel.\n layer += biases\n\n # Use pooling to down-sample the image resolution?\n if use_pooling:\n # This is 2x2 max-pooling, which means that we\n # consider 2x2 windows and select the largest value\n # in each window. Then we move 2 pixels to the next window.\n layer = tf.nn.max_pool(value=layer,\n ksize=[1, 2, 2, 1],\n strides=[1, 2, 2, 1],\n padding='SAME')\n\n # Rectified Linear Unit (ReLU).\n # It calculates max(x, 0) for each input pixel x.\n # This adds some non-linearity to the formula and allows us\n # to learn more complicated functions.\n layer = tf.nn.relu(layer)\n\n # Note that ReLU is normally executed before the pooling,\n # but since relu(max_pool(x)) == max_pool(relu(x)) we can\n # save 75% of the relu-operations by max-pooling first.\n\n # We return both the resulting layer and the filter-weights\n # because we will plot the weights later.\n return layer, weights\n\n\n# ### Flattening a layer\n\n#%%\n\ndef flatten_layer(layer):\n # Get the shape of the input layer.\n layer_shape = layer.get_shape()\n\n # The shape of the input layer is assumed to be:\n # layer_shape == [num_images, img_height, img_width, num_channels]\n\n # The number of features is: img_height * img_width * num_channels\n # We can use a function from TensorFlow to calculate this.\n num_features = layer_shape[1:4].num_elements()\n\n # Reshape the layer to [num_images, num_features].\n # Note that we just set the size of the second dimension\n # to num_features and the size of the first dimension to -1\n # which means the size in that dimension is calculated\n # so the total size of the tensor is unchanged from the reshaping.\n layer_flat = tf.reshape(layer, [-1, num_features])\n\n # The shape of the flattened layer is now:\n # [num_images, img_height * img_width * num_channels]\n\n # Return both the flattened layer and the number of features.\n return layer_flat, num_features", "target_code": "def new_fc_layer(input, # The previous layer.\n num_inputs, # Num. inputs from prev. layer.\n num_outputs, # Num. 
outputs.\n use_relu=True): # Use Rectified Linear Unit (ReLU)?\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.utils import shuffle\nimport cv2\nimport glob\nimport random\nimport math\nfrom datetime import timedelta\nimport time\nfrom sklearn.metrics import confusion_matrix\nimport numpy as np\nimport pandas as pd\nimport tensorflow as tf\nimport matplotlib.pyplot as plt\nimport os\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Function of loading dataset\n\n\ndef load_train(train_path, image_size, classes):\n images = []\n labels = []\n ids = []\n cls = []\n\n print('Reading training images')\n for fld in classes: # assuming data directory has a separate folder for each class, and that each folder is named after the class\n index = classes.index(fld)\n print('Loading {} files (Index: {})'.format(fld, index))\n path = os.path.join(train_path, fld, '*g')\n files = glob.glob(path)\n for fl in files:\n image = cv2.imread(fl)\n image = cv2.resize(image, (image_size, image_size),\n interpolation=cv2.INTER_LINEAR)\n images.append(image)\n label = np.zeros(len(classes))\n label[index] = 1.0\n labels.append(label)\n flbase = os.path.basename(fl)\n ids.append(flbase)\n cls.append(fld)\n images = np.array(images)\n labels = np.array(labels)\n ids = np.array(ids)\n cls = np.array(cls)\n\n return images, labels, ids, cls\n\n\ndef load_test(test_path, image_size):\n path = os.path.join(test_path, '*g')\n files = sorted(glob.glob(path))\n\n X_test = []\n X_test_id = []\n print(\"Reading test images\")\n for fl in files:\n flbase = os.path.basename(fl)\n img = cv2.imread(fl)\n img = cv2.resize(img, (image_size, image_size),\n interpolation=cv2.INTER_LINEAR)\n X_test.append(img)\n X_test_id.append(flbase)\n\n# because we're not creating a DataSet object for the test images, normalization happens here\n X_test = np.array(X_test, dtype=np.uint8)\n X_test = X_test.astype('float32')\n X_test = X_test / 255\n\n return X_test, X_test_id\n\n\nclass DataSet(object):\n\n def __init__(self, images, labels, ids, cls):\n \"\"\"Construct a DataSet. 
one_hot arg is used only if fake_data is true.\"\"\"\n\n self._num_examples = images.shape[0]\n\n # Convert shape from [num examples, rows, columns, depth]\n # to [num examples, rows*columns] (assuming depth == 1)\n # Convert from [0, 255] -> [0.0, 1.0].\n\n images = images.astype(np.float32)\n images = np.multiply(images, 1.0 / 255.0)\n\n self._images = images\n self._labels = labels\n self._ids = ids\n self._cls = cls\n self._epochs_completed = 0\n self._index_in_epoch = 0\n\n @property\n def images(self):\n return self._images\n\n @property\n def labels(self):\n return self._labels\n\n @property\n def ids(self):\n return self._ids\n\n @property\n def cls(self):\n return self._cls\n\n @property\n def num_examples(self):\n return self._num_examples\n\n @property\n def epochs_completed(self):\n return self._epochs_completed\n\n def next_batch(self, batch_size):\n \"\"\"Return the next `batch_size` examples from this data set.\"\"\"\n start = self._index_in_epoch\n self._index_in_epoch += batch_size\n\n if self._index_in_epoch > self._num_examples:\n # Finished epoch\n self._epochs_completed += 1\n\n # # Shuffle the data (maybe)\n # perm = np.arange(self._num_examples)\n # np.random.shuffle(perm)\n # self._images = self._images[perm]\n # self._labels = self._labels[perm]\n # Start next epoch\n\n start = 0\n self._index_in_epoch = batch_size\n assert batch_size <= self._num_examples\n end = self._index_in_epoch\n\n return self._images[start:end], self._labels[start:end], self._ids[start:end], self._cls[start:end]\n\n\ndef read_train_sets(train_path, image_size, classes, validation_size=0):\n class DataSets(object):\n pass\n data_sets = DataSets()\n\n images, labels, ids, cls = load_train(train_path, image_size, classes)\n images, labels, ids, cls = shuffle(\n images, labels, ids, cls) # shuffle the data\n\n if isinstance(validation_size, float):\n validation_size = int(validation_size * images.shape[0])\n\n validation_images = images[:validation_size]\n validation_labels = labels[:validation_size]\n validation_ids = ids[:validation_size]\n validation_cls = cls[:validation_size]\n\n train_images = images[validation_size:]\n train_labels = labels[validation_size:]\n train_ids = ids[validation_size:]\n train_cls = cls[validation_size:]\n\n data_sets.train = DataSet(\n train_images, train_labels, train_ids, train_cls)\n data_sets.valid = DataSet(\n validation_images, validation_labels, validation_ids, validation_cls)\n\n return data_sets\n\n\ndef read_test_set(test_path, image_size):\n images, ids = load_test(test_path, image_size)\n return images, ids\n\n\n# ## Configuration and Hyperparameters\n\n\n# Convolutional Layer 1.\nfilter_size1 = 5\nnum_filters1 = 64\n\n# Convolutional Layer 2.\nfilter_size2 = 3\nnum_filters2 = 64\n\n# # Convolutional Layer 3.\n# filter_size3 = 5\n# num_filters3 = 128\n\n# Fully-connected layer 1.\nfc1_size = 128 # Number of neurons in fully-connected layer.\n\n# Fully-connected layer 2.\nfc2_size = 128 # Number of neurons in fully-connected layer.\n\n# Number of color channels for the images: 1 channel for gray-scale.\nnum_channels = 3\n\n# image dimensions (only squares for now)\nimg_size = 64\n\n# Size of image when flattened to a single dimension\nimg_size_flat = img_size * img_size * num_channels\n\n# Tuple with height and width of images used to reshape arrays.\nimg_shape = (img_size, img_size)\n\n# class info\nclasses = ['Sphynx', 'Siamese', 'Ragdoll',\n 'Persian', 'Maine_Coon', 'British_shorthair', 'Bombay', 'Birman', 'Bengal', 'Abyssinian']\n\n# classes = 
['Sphynx','Siamese',\n# 'Persian','Maine_Coon','British_shorthair']\n\nnum_classes = len(classes)\n\n# batch size\nbatch_size = 32\n\n# validation split\nvalidation_size = .2\n\n# how long to wait after validation loss stops improving before terminating training\nearly_stopping = None # use None if you don't want to implement early stoping\n\ntrain_path = 'dataset'\n# test_path = 'test'\ncheckpoint_dir = \"ckpoint\"\n\n\n\n\n\n# load training dataset\ndata = read_train_sets(train_path, img_size, classes,\n validation_size=validation_size)\n# test_images, test_ids = read_test_set(test_path, img_size)\n\n\nprint(\"Size of:\")\nprint(\"- Training-set:\\t\\t{}\".format(len(data.train.labels)))\n# print(\"- Test-set:\\t\\t{}\".format(len(test_images)))\nprint(\"- Validation:\\t{}\".format(len(data.valid.labels)))\n# print(images)\n\n\n# ### Helper-function for plotting images\n\n\ndef plot_images(images, cls_true, cls_pred=None):\n\n if len(images) == 0:\n print(\"no images to show\")\n return\n else:\n random_indices = random.sample(range(len(images)), min(len(images), 9))\n\n images, cls_true = zip(*[(images[i], cls_true[i]) for i in random_indices])\n\n # Create figure with 3x3 sub-plots.\n fig, axes = plt.subplots(3, 3)\n fig.subplots_adjust(hspace=0.3, wspace=0.3)\n\n for i, ax in enumerate(axes.flat):\n # Plot image.\n ax.imshow(images[i].reshape(img_size, img_size, num_channels))\n\n # Show true and predicted classes.\n if cls_pred is None:\n xlabel = \"True: {0}\".format(cls_true[i])\n else:\n xlabel = \"True: {0}, Pred: {1}\".format(cls_true[i], cls_pred[i])\n\n # Show the classes as the label on the x-axis.\n ax.set_xlabel(xlabel)\n\n # Remove ticks from the plot.\n ax.set_xticks([])\n ax.set_yticks([])\n\n # Ensure the plot is shown correctly with multiple plots\n # in a single Notebook cell.\n plt.show()\n\n\n# Get some random images and their labels from the train set.\n\nimages, cls_true = data.train.images, data.train.cls\n\n# Plot the images and labels using our helper-function above.\nplot_images(images=images, cls_true=cls_true)\n\n\n# ## TensorFlow Graph\n\n# ### Helper-functions for creating new variables\n\n\ndef new_weights(shape):\n return tf.Variable(tf.truncated_normal(shape, stddev=0.05))\n\n\ndef new_biases(length):\n return tf.Variable(tf.constant(0.05, shape=[length]))\n\n\n# ### Convolutional Layer\n\n\ndef new_conv_layer(input, # The previous layer.\n num_input_channels, # Num. channels in prev. layer.\n filter_size, # Width and height of each filter.\n num_filters, # Number of filters.\n use_pooling=True): # Use 2x2 max-pooling.\n\n # Shape of the filter-weights for the convolution.\n # This format is determined by the TensorFlow API.\n shape = [filter_size, filter_size, num_input_channels, num_filters]\n\n # Create new weights aka. filters with the given shape.\n weights = new_weights(shape=shape)\n\n # Create new biases, one for each filter.\n biases = new_biases(length=num_filters)\n\n # Create the TensorFlow operation for convolution.\n # Note the strides are set to 1 in all dimensions.\n # The first and last stride must always be 1,\n # because the first is for the image-number and\n # the last is for the input-channel.\n # But e.g. 
strides=[1, 2, 2, 1] would mean that the filter\n # is moved 2 pixels across the x- and y-axis of the image.\n # The padding is set to 'SAME' which means the input image\n # is padded with zeroes so the size of the output is the same.\n layer = tf.nn.conv2d(input=input,\n filter=weights,\n strides=[1, 1, 1, 1],\n padding='SAME')\n\n # Add the biases to the results of the convolution.\n # A bias-value is added to each filter-channel.\n layer += biases\n\n # Use pooling to down-sample the image resolution?\n if use_pooling:\n # This is 2x2 max-pooling, which means that we\n # consider 2x2 windows and select the largest value\n # in each window. Then we move 2 pixels to the next window.\n layer = tf.nn.max_pool(value=layer,\n ksize=[1, 2, 2, 1],\n strides=[1, 2, 2, 1],\n padding='SAME')\n\n # Rectified Linear Unit (ReLU).\n # It calculates max(x, 0) for each input pixel x.\n # This adds some non-linearity to the formula and allows us\n # to learn more complicated functions.\n layer = tf.nn.relu(layer)\n\n # Note that ReLU is normally executed before the pooling,\n # but since relu(max_pool(x)) == max_pool(relu(x)) we can\n # save 75% of the relu-operations by max-pooling first.\n\n # We return both the resulting layer and the filter-weights\n # because we will plot the weights later.\n return layer, weights\n\n\n# ### Flattening a layer\n\n\ndef flatten_layer(layer):\n # Get the shape of the input layer.\n layer_shape = layer.get_shape()\n\n # The shape of the input layer is assumed to be:\n # layer_shape == [num_images, img_height, img_width, num_channels]\n\n # The number of features is: img_height * img_width * num_channels\n # We can use a function from TensorFlow to calculate this.\n num_features = layer_shape[1:4].num_elements()\n\n # Reshape the layer to [num_images, num_features].\n # Note that we just set the size of the second dimension\n # to num_features and the size of the first dimension to -1\n # which means the size in that dimension is calculated\n # so the total size of the tensor is unchanged from the reshaping.\n layer_flat = tf.reshape(layer, [-1, num_features])\n\n # The shape of the flattened layer is now:\n # [num_images, img_height * img_width * num_channels]\n\n # Return both the flattened layer and the number of features.\n return layer_flat, num_features\n\n\n\n", "project_metadata": {"full_name": "BenJamesbabala/Image_Classification_with_5_methods", "description": "Compared performance of KNN, SVM, BPNN, CNN, Transfer Learning (retrain on Inception v3) on image classification problem. CNN is implemented with TensorFlow", "topics": [], "git_url": "git://github.com/BenJamesbabala/Image_Classification_with_5_methods.git", "stars": 27, "watchers": 27, "forks": 140, "created": "2017-05-14T13:59:37Z", "size": 88429, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 150845, "Python": 6537}, "last_updated": "2020-12-29T17:52:57Z"}, "intent": "# Fully-Connected Layer"}, {"original_comment": "# * A stacked histogram:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n# \n# Computational Finance and FinTech

\n# M.Sc. International Finance\n# \n# Prof. Dr. Natalie Packham\n# Berlin School of Economics and Law\n# Summer Term 2019\n# \n# Table of Contents
\n# \n\n# # Data Visualisation\n\n# * Further reading: __Py4Fi, Chapter 7__\n# * We look at the [`matplotlib`](http://www.matplotlib.org) plotting libraries.\n# * Interactive 2D plotting is available with [`plotly`](http://plot.ly).\n# * More information on `plotly` and some introductory examples can be found in the __Python for Finance__ book.\n\n# ## Static 2D Plotting\n#\n# * Some standard imports and customatisations:\n\n#%%\n\nfrom mpl_toolkits.mplot3d import Axes3D\nimport numpy as np\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt # main plotting subpackage\nplt.style.use('seaborn') # sets the plotting style\nmpl.rcParams['font.family'] = 'serif' # set the font to be serif in all plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ensures that output of plotting command is displayed within notebook\n\n#%%\n\nmpl.__version__ # the version of matplotlib\n\n\n# ### Simple plotting\n# * The standard (and powerful) plotting method is `plt.plot()`.\n# * It takes as basic argument lists or arrays of $x$ values and $y$ values\n\n#%%\n\nnp.random.seed(1000)\ny = np.random.standard_normal(20) # draw some random numbers\nx = np.arange(len(y)) # fix the x axis\nplt.plot(x, y) # plot y against x\n\n\n# ### Simple plotting\n# * A number functions are available to customise the plot:\n\n#%%\n\nplt.plot(y.cumsum())\nplt.grid(False)\n\n\n# ### Simple plotting\n# * Options for `plt.axis()`:\n#\n# ![Options for plt.axis](pics/plt_axis.png)\n#\n#\n#
Source: Python for Finance, 2nd ed.
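# A small sketch (an assumption, not taken from the book) showing two common ways
# to call plt.axis() from the option table referenced above.
import numpy as np
import matplotlib.pyplot as plt

y = np.random.standard_normal(20)
plt.plot(y.cumsum())
plt.axis('tight')             # fit the axis limits closely to the data
# plt.axis([0, 20, -5, 5])    # or set [xmin, xmax, ymin, ymax] explicitly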
\n\n# ### Simple plotting\n# * Further customisations:\n\n#%%\n\nplt.figure(figsize=(8, 5)) # increase size of figure\nplt.plot(y.cumsum(), 'b', lw=1.5) # plot data in blue with a line width of 1.5\nplt.plot(y.cumsum(), 'ro') # plot the data points as red dots\nplt.xlabel('index') # label of x-axis\nplt.ylabel('value') # label of y-axis\nplt.title('A Simple Plot') # plot title\n\n\n# ### Simple plotting\n# * Standard colour abbreviations:\n#\n# Character | Colour\n# ----------| -----------\n# b | blue\n# g | green\n# r | red\n# c | cyan\n# m | magenta\n# y | yellow\n# k | black\n# w | white\n\n# ### Simple plotting\n# * Line styles:\n#\n# Character | Colour\n# ----------| -----------\n# `'-'` | solid line\n# `'--'` | dashed line\n# `'-.` | dash-dot line\n# `':'` | dotted line\n\n# ### Simple plotting\n# * Some marker styles:\n#\n# Character | Colour\n# ----------| -----------\n# `'.'` | point\n# `','` | pixel\n# `'o` | circle\n# `'v'` | triangle down\n# `'^'` | triangle up\n# `'<'` | triangle left\n# `'>'` | triangle right\n# `'*'` | star\n# `'h'` | hexagon\n#\n# * More marker styles are found [here](https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html) and [here](https://matplotlib.org/api/markers_api.html).\n\n# ### Plotting several data sets\n# * If the data are arranged in a multi-dimensional array, then `plot()` will automatically plot the columns separately:\n\n#%%\n\ny = np.random.standard_normal((20, 2)).cumsum(axis=0)\nplt.figure(figsize=(6, 3))\n# define a label to be used in the legend\nplt.plot(y[:, 0], lw=1.5, label='1st')\nplt.plot(y[:, 1], lw=1.5, label='2nd')\nplt.plot(y, 'ro')\n# add a legend, consult the legend help to find out about locations\nplt.legend(loc=0)\nplt.xlabel('index')\nplt.ylabel('value')\nplt.title('A Simple Plot')\n\n\n# ### Subplots\n# * `plt.subplots()` is a powerful method to either combine several plots with separate axes or to produce separate plots.\n# * In the first example, the plots overlay each other:\n\n# ### Subplots\n\n#%%\n\ny[:, 0] = y[:, 0] * 100\n\n#%%\n\nfig, ax1 = plt.subplots() # defines figure and axis objects\nplt.plot(y[:, 0], 'b', lw=1.5, label='1st')\nplt.plot(y[:, 0], 'ro')\nplt.legend(loc=8)\nplt.xlabel('index')\nplt.ylabel('value 1st')\nplt.title('A Simple Plot')\nax2 = ax1.twinx() # create a second y-axis object\nplt.plot(y[:, 1], 'g', lw=1.5, label='2nd')\nplt.plot(y[:, 1], 'ro')\nplt.legend(loc=0)\nplt.ylabel('value 2nd')\n\n\n# ### Subplots\n# * The second example creates two separate plots.\n# * The main argument to `subplot()` is a 3-digit integer describing the position of the subplot.\n# * The integers refer to `nrows`, `ncols` and `index`, where `index` starts at 1 in the upper left corner and increases to the right.\n\n# ### Subplots\n\n#%%\n\nplt.figure(figsize=(6, 3))\n# defines the upper plot in a figure with two rows and one column\nplt.subplot(211)\nplt.plot(y[:, 0], lw=1.5, label='1st')\nplt.plot(y[:, 0], 'ro')\nplt.legend(loc=0)\nplt.ylabel('value')\nplt.title('A Simple Plot')\nplt.subplot(212) # defines the lower plot\nplt.plot(y[:, 1], 'g', lw=1.5, label='2nd')\nplt.plot(y[:, 1], 'ro')\nplt.legend(loc=0)\nplt.xlabel('index')\nplt.ylabel('value')\n\n\n# ### Other plot styles\n# * The following examples introduce bar charts, scatter plots, histograms and boxplots\n\n# ### Bar chart\n\n#%%\n\nplt.bar(np.arange(len(y)), abs(y[:, 1]), width=0.5,\n color='g')\nplt.xlabel('index')\nplt.title('Bar chart')\n\n\n# ### Scatter plot\n\n#%%\n\ny = np.random.standard_normal((1000, 2))\n\n#%%\n\nplt.scatter(y[:, 
0], y[:, 1], marker='o')\nplt.xlabel('1st')\nplt.ylabel('2nd')\nplt.title('Scatter Plot')\n\n\n# ### Scatter plot\n# * Adding a third dimension via a colour map:\n\n#%%\n\nc = np.random.randint(0, 10, len(y))\n\n#%%\n\nplt.scatter(y[:, 0], y[:, 1],\n c=c,\n cmap='coolwarm',\n marker='o')\nplt.colorbar()\nplt.xlabel('1st')\nplt.ylabel('2nd')\nplt.title('Scatter Plot')\n\n\n# ### Histogram\n\n#%%\n\nplt.hist(y, label=['1st', '2nd'], bins=25)\nplt.legend(loc=0)\nplt.xlabel('value')\nplt.ylabel('frequency')\nplt.title('Histogram')\n\n\n# ### Histogram\n# * Parameters for `plt.hist()`:\n#\n# ![Parameters for plt.hist](pics/plt_hist_1.png)\n#\n# ![Parameters for plt.hist](pics/plt_hist_2.png)\n#\n#\n#
Source: Python for Finance, 2nd ed.
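# Illustrative sketch only (parameter choices are assumptions): plt.hist() accepts
# the options listed above, e.g. density for a normalized histogram.
import numpy as np
import matplotlib.pyplot as plt

y = np.random.standard_normal((1000, 2))
plt.hist(y, label=['1st', '2nd'], bins=25, density=True, cumulative=False)
plt.legend(loc=0)
plt.xlabel('value')
plt.ylabel('relative frequency')
plt.title('Normalized Histogram')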
\n\n# ### Histogram", "target_code": "plt.hist(y, label=['1st', '2nd'], color=['b', 'g'],\n stacked=True, bins=20, alpha=0.5)\nplt.legend(loc=0)\nplt.xlabel('value')\nplt.ylabel('frequency')\nplt.title('Histogram')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n# \n# Computational Finance and FinTech

\n# M.Sc. International Finance\n# \n# Prof. Dr. Natalie Packham\n# Berlin School of Economics and Law\n# Summer Term 2019\n# \n# Table of Contents
\n# \n\n# # Data Visualisation\n\n# * Further reading: __Py4Fi, Chapter 7__\n# * We look at the [`matplotlib`](http://www.matplotlib.org) plotting libraries.\n# * Interactive 2D plotting is available with [`plotly`](http://plot.ly).\n# * More information on `plotly` and some introductory examples can be found in the __Python for Finance__ book.\n\n# ## Static 2D Plotting\n#\n# * Some standard imports and customatisations:\n\n\nfrom mpl_toolkits.mplot3d import Axes3D\nimport numpy as np\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt # main plotting subpackage\nplt.style.use('seaborn') # sets the plotting style\nmpl.rcParams['font.family'] = 'serif' # set the font to be serif in all plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ensures that output of plotting command is displayed within notebook\n\n\nmpl.__version__ # the version of matplotlib\n\n\n# ### Simple plotting\n# * The standard (and powerful) plotting method is `plt.plot()`.\n# * It takes as basic argument lists or arrays of $x$ values and $y$ values\n\n\nnp.random.seed(1000)\ny = np.random.standard_normal(20) # draw some random numbers\nx = np.arange(len(y)) # fix the x axis\nplt.plot(x, y) # plot y against x\n\n\n# ### Simple plotting\n# * A number functions are available to customise the plot:\n\n\nplt.plot(y.cumsum())\nplt.grid(False)\n\n\n# ### Simple plotting\n# * Options for `plt.axis()`:\n#\n# ![Options for plt.axis](pics/plt_axis.png)\n#\n#\n#
Source: Python for Finance, 2nd ed.
\n\n# ### Simple plotting\n# * Further customisations:\n\n\nplt.figure(figsize=(8, 5)) # increase size of figure\nplt.plot(y.cumsum(), 'b', lw=1.5) # plot data in blue with a line width of 1.5\nplt.plot(y.cumsum(), 'ro') # plot the data points as red dots\nplt.xlabel('index') # label of x-axis\nplt.ylabel('value') # label of y-axis\nplt.title('A Simple Plot') # plot title\n\n\n# ### Simple plotting\n# * Standard colour abbreviations:\n#\n# Character | Colour\n# ----------| -----------\n# b | blue\n# g | green\n# r | red\n# c | cyan\n# m | magenta\n# y | yellow\n# k | black\n# w | white\n\n# ### Simple plotting\n# * Line styles:\n#\n# Character | Colour\n# ----------| -----------\n# `'-'` | solid line\n# `'--'` | dashed line\n# `'-.` | dash-dot line\n# `':'` | dotted line\n\n# ### Simple plotting\n# * Some marker styles:\n#\n# Character | Colour\n# ----------| -----------\n# `'.'` | point\n# `','` | pixel\n# `'o` | circle\n# `'v'` | triangle down\n# `'^'` | triangle up\n# `'<'` | triangle left\n# `'>'` | triangle right\n# `'*'` | star\n# `'h'` | hexagon\n#\n# * More marker styles are found [here](https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html) and [here](https://matplotlib.org/api/markers_api.html).\n\n# ### Plotting several data sets\n# * If the data are arranged in a multi-dimensional array, then `plot()` will automatically plot the columns separately:\n\n\ny = np.random.standard_normal((20, 2)).cumsum(axis=0)\nplt.figure(figsize=(6, 3))\n# define a label to be used in the legend\nplt.plot(y[:, 0], lw=1.5, label='1st')\nplt.plot(y[:, 1], lw=1.5, label='2nd')\nplt.plot(y, 'ro')\n# add a legend, consult the legend help to find out about locations\nplt.legend(loc=0)\nplt.xlabel('index')\nplt.ylabel('value')\nplt.title('A Simple Plot')\n\n\n# ### Subplots\n# * `plt.subplots()` is a powerful method to either combine several plots with separate axes or to produce separate plots.\n# * In the first example, the plots overlay each other:\n\n# ### Subplots\n\n\ny[:, 0] = y[:, 0] * 100\n\n\nfig, ax1 = plt.subplots() # defines figure and axis objects\nplt.plot(y[:, 0], 'b', lw=1.5, label='1st')\nplt.plot(y[:, 0], 'ro')\nplt.legend(loc=8)\nplt.xlabel('index')\nplt.ylabel('value 1st')\nplt.title('A Simple Plot')\nax2 = ax1.twinx() # create a second y-axis object\nplt.plot(y[:, 1], 'g', lw=1.5, label='2nd')\nplt.plot(y[:, 1], 'ro')\nplt.legend(loc=0)\nplt.ylabel('value 2nd')\n\n\n# ### Subplots\n# * The second example creates two separate plots.\n# * The main argument to `subplot()` is a 3-digit integer describing the position of the subplot.\n# * The integers refer to `nrows`, `ncols` and `index`, where `index` starts at 1 in the upper left corner and increases to the right.\n\n# ### Subplots\n\n\nplt.figure(figsize=(6, 3))\n# defines the upper plot in a figure with two rows and one column\nplt.subplot(211)\nplt.plot(y[:, 0], lw=1.5, label='1st')\nplt.plot(y[:, 0], 'ro')\nplt.legend(loc=0)\nplt.ylabel('value')\nplt.title('A Simple Plot')\nplt.subplot(212) # defines the lower plot\nplt.plot(y[:, 1], 'g', lw=1.5, label='2nd')\nplt.plot(y[:, 1], 'ro')\nplt.legend(loc=0)\nplt.xlabel('index')\nplt.ylabel('value')\n\n\n# ### Other plot styles\n# * The following examples introduce bar charts, scatter plots, histograms and boxplots\n\n# ### Bar chart\n\n\nplt.bar(np.arange(len(y)), abs(y[:, 1]), width=0.5,\n color='g')\nplt.xlabel('index')\nplt.title('Bar chart')\n\n\n# ### Scatter plot\n\n\ny = np.random.standard_normal((1000, 2))\n\n\nplt.scatter(y[:, 0], y[:, 1], 
marker='o')\nplt.xlabel('1st')\nplt.ylabel('2nd')\nplt.title('Scatter Plot')\n\n\n# ### Scatter plot\n# * Adding a third dimension via a colour map:\n\n\nc = np.random.randint(0, 10, len(y))\n\n\nplt.scatter(y[:, 0], y[:, 1],\n c=c,\n cmap='coolwarm',\n marker='o')\nplt.colorbar()\nplt.xlabel('1st')\nplt.ylabel('2nd')\nplt.title('Scatter Plot')\n\n\n# ### Histogram\n\n\nplt.hist(y, label=['1st', '2nd'], bins=25)\nplt.legend(loc=0)\nplt.xlabel('value')\nplt.ylabel('frequency')\nplt.title('Histogram')\n\n\n# ### Histogram\n# * Parameters for `plt.hist()`:\n#\n# ![Parameters for plt.hist](pics/plt_hist_1.png)\n#\n# ![Parameters for plt.hist](pics/plt_hist_2.png)\n#\n#\n#
Source: Python for Finance, 2nd ed.
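# The section mentions boxplots, but no boxplot cell appears in this excerpt; a
# minimal sketch (assumed, not taken from the book) would be:
import numpy as np
import matplotlib.pyplot as plt

y = np.random.standard_normal((1000, 2))
plt.boxplot(y)                        # one box per column of y
plt.xticks([1, 2], ['1st', '2nd'])    # label the two data sets
plt.xlabel('data set')
plt.ylabel('value')
plt.title('Boxplot')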
\n\n# ### Histogram\n\n\n\n", "project_metadata": {"full_name": "packham/compfin", "description": "Computational Finance and FinTech / M.Sc. International Finance / Berlin School of Economics and Law Berlin", "topics": [], "git_url": "git://github.com/packham/compfin.git", "stars": 3, "watchers": 3, "forks": 5, "created": "2019-02-24T19:55:19Z", "size": 17837, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1222171, "JavaScript": 142289, "CSS": 51622}, "last_updated": "2020-05-28T08:15:57Z"}, "intent": "# * A stacked histogram:"}, {"original_comment": "# The Transpose can be taken with the attribute T:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Numpy basics\n\n# This notebook was adapted from the one made by Brian d'Alessandro for the class DS-GA-1001 Introduction to Data Science where it was in turn adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing.\n#\n# NumPy has many features that will not be covered here. The following snippets are just to illustrate basic data types and operations within numpy.\n#\n# The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array processing. Another good resource for learning more about ndarrays is here:\n# http://docs.scipy.org/doc/numpy/reference/arrays.html\n\n# First import numpy:\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport math\nimport numpy as np\n\n\n# ## Creating arrays\n\n# The easiest way to create an array is to transform a standard Python list(s) using the *array* function.\n\n#%%\n\n# It is easy to create Nx1 and NxM arrays from standard Python lists\nlist1 = [0, 1, 2]\nlist2 = [3, 4, 5]\n\nnd1 = np.array(list1)\nnd2 = np.array([list1, list2])\n\n#%%\n\nnd1\n\n#%%\n\nnd2\n\n\n# Now, we can ask for some basic info to describe the ndarray using the *shape* and *dtype* properties of the ndarray objects. While it might be counterintuitive looking at the above output, one dimensional arrays are always treated as column vectors.\n\n#%%\n\ndef desc_ndarray(nd):\n try:\n print(\"The ndarray has dimension n=%s and m=%s\" %\n (nd.shape[0], nd.shape[1]))\n except:\n print(\"The ndarray has dimension n=%s and m=1\" % nd.shape[0])\n print(\"The ndarray has elements of type=%s\" % nd.dtype)\n\n\ndesc_ndarray(nd1)\ndesc_ndarray(nd2)\n\n\n# ## Special arrays\n\n# There are shortcuts for creating certain frequently used special ndarrays, for example, an array of zeros, ones, and independent standard normal random elements:\n\n#%%\n\nk = 4\nn = 2\nm = 3\n\n# An ndarray of all zeros\nzero = np.zeros((n, m))\n\n# An ndarray of all ones\none = np.ones(k)\n\n# An ndarray of random elements (this one is standard normal,\n# but there are many distributions to choose from). Notice the\n# difference in passing the dimensions\nrand = np.random.randn(3, 5)\n\n[zero, one, rand]\n\n\n# To generate a range that is automatically a numpy array, we can use the *arange()* method:\n\n#%%\n\nnp.arange(100)\n\n\n# ## Array indexing\n\n# For indexing an array:\n# 1. If nx1 array, follow the same protocol for a regular Python list\n# 2. If nxm array look at the following examples.\n# Do not forget that indexing starts at zero!\n\n#%%\n\n# A single index gets a full row, while 2 indexes returns a value\n[rand, rand[1], rand[1, 1]]\n\n\n# You might ask how to select a column. 
We can use slicing, which you can also do on lists:\n#\n# https://www.pythoncentral.io/how-to-slice-listsarrays-and-tuples-in-python/\n\n#%%\n\nrand[:, 2]\n\n\n# ## Operations between Arrays and Scalars\n\n# An important feature of ndarrays is they allow batch operations on data without writing any for loops. This is called vectorization. Any arithmetic operations between equal-size arrays applies the operation elementwise. Arithmetic operations with scalars are also as you would expect, propagating the value to each element also referred to as broadcasting:\n#\n# https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html\n\n#%%\n\nk = 4\nrand = np.random.randn(k)\n[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n\n#%%\n\narr = np.array([[1., 2., 3.], [4., 5., 6.]])\n\n[1/arr, arr ** 0.5]\n\n\n# It is easy to do matrix operations with ndarrays. It is important to make sure matrix shapes are compatible.\n#\n# Matrix addition is the standard matrix operator:\n\n#%%\n\nk = 3\nr1 = np.random.randn(k)\nr2 = np.random.randn(k)\n\n[r1, r2, r1 + r2]", "target_code": "arr2d = np.array([r1, r2])\n[arr2d, arr2d.T]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Numpy basics\n\n# This notebook was adapted from the one made by Brian d'Alessandro for the class DS-GA-1001 Introduction to Data Science where it was in turn adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing.\n#\n# NumPy has many features that will not be covered here. The following snippets are just to illustrate basic data types and operations within numpy.\n#\n# The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array processing. Another good resource for learning more about ndarrays is here:\n# http://docs.scipy.org/doc/numpy/reference/arrays.html\n\n# First import numpy:\n\n\nimport matplotlib.pyplot as plt\nimport math\nimport numpy as np\n\n\n# ## Creating arrays\n\n# The easiest way to create an array is to transform a standard Python list(s) using the *array* function.\n\n\n# It is easy to create Nx1 and NxM arrays from standard Python lists\nlist1 = [0, 1, 2]\nlist2 = [3, 4, 5]\n\nnd1 = np.array(list1)\nnd2 = np.array([list1, list2])\n\n\nnd1\n\n\nnd2\n\n\n# Now, we can ask for some basic info to describe the ndarray using the *shape* and *dtype* properties of the ndarray objects. While it might be counterintuitive looking at the above output, one dimensional arrays are always treated as column vectors.\n\n\ndef desc_ndarray(nd):\n try:\n print(\"The ndarray has dimension n=%s and m=%s\" %\n (nd.shape[0], nd.shape[1]))\n except:\n print(\"The ndarray has dimension n=%s and m=1\" % nd.shape[0])\n print(\"The ndarray has elements of type=%s\" % nd.dtype)\n\n\ndesc_ndarray(nd1)\ndesc_ndarray(nd2)\n\n\n# ## Special arrays\n\n# There are shortcuts for creating certain frequently used special ndarrays, for example, an array of zeros, ones, and independent standard normal random elements:\n\n\nk = 4\nn = 2\nm = 3\n\n# An ndarray of all zeros\nzero = np.zeros((n, m))\n\n# An ndarray of all ones\none = np.ones(k)\n\n# An ndarray of random elements (this one is standard normal,\n# but there are many distributions to choose from). Notice the\n# difference in passing the dimensions\nrand = np.random.randn(3, 5)\n\n[zero, one, rand]\n\n\n# To generate a range that is automatically a numpy array, we can use the *arange()* method:\n\n\nnp.arange(100)\n\n\n# ## Array indexing\n\n# For indexing an array:\n# 1. 
If nx1 array, follow the same protocol for a regular Python list\n# 2. If nxm array look at the following examples.\n# Do not forget that indexing starts at zero!\n\n\n# A single index gets a full row, while 2 indexes returns a value\n[rand, rand[1], rand[1, 1]]\n\n\n# You might ask how to select a column. We can use slicing, which you can also do on lists:\n#\n# https://www.pythoncentral.io/how-to-slice-listsarrays-and-tuples-in-python/\n\n\nrand[:, 2]\n\n\n# ## Operations between Arrays and Scalars\n\n# An important feature of ndarrays is they allow batch operations on data without writing any for loops. This is called vectorization. Any arithmetic operations between equal-size arrays applies the operation elementwise. Arithmetic operations with scalars are also as you would expect, propagating the value to each element also referred to as broadcasting:\n#\n# https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html\n\n\nk = 4\nrand = np.random.randn(k)\n[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n\n\narr = np.array([[1., 2., 3.], [4., 5., 6.]])\n\n[1/arr, arr ** 0.5]\n\n\n# It is easy to do matrix operations with ndarrays. It is important to make sure matrix shapes are compatible.\n#\n# Matrix addition is the standard matrix operator:\n\n\nk = 3\nr1 = np.random.randn(k)\nr2 = np.random.randn(k)\n\n[r1, r2, r1 + r2]\n\n\n\n", "project_metadata": {"full_name": "Moosquibe/Mathematical-Statistics", "description": "This repository contains notes related to the course I teach in Mathematical Statistics at NYU 2018 Spring", "topics": [], "git_url": "git://github.com/Moosquibe/Mathematical-Statistics.git", "stars": 4, "watchers": 4, "forks": 5, "created": "2017-11-29T00:59:05Z", "size": 3463, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3558412}, "last_updated": "2018-04-22T22:15:28Z"}, "intent": "# The Transpose can be taken with the attribute T:"}, {"original_comment": "# Generate weights\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Demo: Linear Models\n# - [Problem Statement](#problem-statement)\n# - [Model](#solution)\n# 1. [Initialise Libraries](#initialise-libraries)\n# 2. [Create Helper Functions](#helper-functions)\n# - [Visualization](#visualization)\n# - [Data Extraction](#data-extraction)\n# - [! Model Function](#model)\n# - [! Loss Function](#loss)\n# - [! Gradient Descent](#gradient-descent)\n# 3. [Train Model](#train-model)\n# - [Initialise Training Variables](#initialise)\n# - [Training Script](#training-script)\n# - [Plot Graph](#plot-graph)\n\n# # Problem Statement\n\n# A study is trying to investigate a correlation between the population of a city and the average profit of a household of its' population. The study has collected a dataset of population (in 1000s) and average profit per household (in $1000s). Based on the spread of the data, a **linear model** is selected to describe the relationship. Plot the linear model after 1500 iterations of training on the data\n# $$\\hat{y^{(i)}}=w_{0}+w_{1}*x^{(i)}$$\n#\n# A sample of the dataset is visualized below:\n#\n# | Population (in 1000s) | Profit per household (in 1000s) |\n# | ---------------------- | ------------------------------- |\n# | 6.1101 | 17.592 |\n# | 5.5277 | 9.1302 |\n# | 8.5186 | 13.662 |\n# | 7.0032 | 11.854 |\n\n# # Model\n\n# ## 1. 
Initialise Libraries\n# Essential libraries for linear algebra operations and visualization.\n\n#%%\n\n# scientific computing library\nimport warnings\nimport numpy as np\n\n# visualization tools\nimport matplotlib.pyplot as plt\n\n# show plots without need of calling `.show()`\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# prettify plots\nplt.rcParams['figure.figsize'] = [20, 10]\n\n# supress warnings\nwarnings.filterwarnings('ignore')\n\n\n# ## 2. Create Helper Functions\n\n# ### Visualization Function\n\n#%%\n\n# Function for plotting the graph\ndef visualize(x, y=None, y_hat=None):\n plt.xlabel(\"Population in 10,000s\")\n plt.ylabel(\"Profit in 10,000s\")\n if y is not None and y_hat is not None:\n plt.plot(x, y, 'x', label='Observed Values')\n plt.plot(x, y_hat, '-', label='Model Predictions')\n plt.legend()\n\n\n# ### Extracting Data\n\n#%%\n\n# function to load files\ndef loader(filename):\n c = np.loadtxt('./data/%s.txt' % filename, delimiter=',')\n x = c[:, :-1] # extract every column except the last column\n y = c[:, -1] # extract the last column\n return x, y\n\n\n# Load x and y values\nx, y = loader('ex1data1')\nx = x.flatten()\n\n\n# ### ! Model Function\n# $$\\hat{y^{(i)}}=w_{0}+w_{1}*x^{(i)}$$\n\n#%%\n\ndef simple_linear_model(w0, w1, x1):\n #TO-DO#\n # Implement function that outputs y_hat for a mx1 size dataset of x\n\n return y_hat\n\n\n# ### ! Loss Function- Mean Square Error\n#\n# $$MSE = \\frac{1}{2m} \\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})^{2}$$\n\n#%%\n\ndef calculate_MSE(y, y_hat, size):\n #TO-DO#\n # Calculate the mean squared error based on the difference between model predictions and observed data.\n # HINT: Make use of the function np.sum() to sum the squares of the differences\n\n return loss\n\n\n# ### ! Gradient Descent\n# $$w_{0} = w_{0} - \\alpha \\frac{1}{m}\\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})$$\n# $$w_{1} = w_{1} - \\alpha \\frac{1}{m}\\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})x^{(i)}$$\n\n#%%\n\ndef gradient_descent(w0, w1, x, y, y_hat, size, learning_rate=0.1):\n #TO-DO#\n # Implement function that adjusts the weights, w0 and w1 based on the gradient descent algorithm\n # HINT: Remember that for w1, the difference have to be multiplied by x before summing\n\n return w0, w1\n\n\n# ## 3. Train Model\n\n# ### Initialise Training Variables\n\n#%%\n\nn_epochs = 1500 # number of iterations\nnp.random.seed(0) # seeding to persist results\nm = y.size # Get total number of data samples", "target_code": "w0 = np.random.rand(1)\nw1 = np.random.rand(1)\nprint(\"weights are: \", w0, w1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Demo: Linear Models\n# - [Problem Statement](#problem-statement)\n# - [Model](#solution)\n# 1. [Initialise Libraries](#initialise-libraries)\n# 2. [Create Helper Functions](#helper-functions)\n# - [Visualization](#visualization)\n# - [Data Extraction](#data-extraction)\n# - [! Model Function](#model)\n# - [! Loss Function](#loss)\n# - [! Gradient Descent](#gradient-descent)\n# 3. [Train Model](#train-model)\n# - [Initialise Training Variables](#initialise)\n# - [Training Script](#training-script)\n# - [Plot Graph](#plot-graph)\n\n# # Problem Statement\n\n# A study is trying to investigate a correlation between the population of a city and the average profit of a household of its' population. The study has collected a dataset of population (in 1000s) and average profit per household (in $1000s). Based on the spread of the data, a **linear model** is selected to describe the relationship. 
Plot the linear model after 1500 iterations of training on the data\n# $$\\hat{y^{(i)}}=w_{0}+w_{1}*x^{(i)}$$\n#\n# A sample of the dataset is visualized below:\n#\n# | Population (in 1000s) | Profit per household (in 1000s) |\n# | ---------------------- | ------------------------------- |\n# | 6.1101 | 17.592 |\n# | 5.5277 | 9.1302 |\n# | 8.5186 | 13.662 |\n# | 7.0032 | 11.854 |\n\n# # Model\n\n# ## 1. Initialise Libraries\n# Essential libraries for linear algebra operations and visualization.\n\n\n# scientific computing library\nimport warnings\nimport numpy as np\n\n# visualization tools\nimport matplotlib.pyplot as plt\n\n# show plots without need of calling `.show()`\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# prettify plots\nplt.rcParams['figure.figsize'] = [20, 10]\n\n# supress warnings\nwarnings.filterwarnings('ignore')\n\n\n# ## 2. Create Helper Functions\n\n# ### Visualization Function\n\n\n# Function for plotting the graph\ndef visualize(x, y=None, y_hat=None):\n plt.xlabel(\"Population in 10,000s\")\n plt.ylabel(\"Profit in 10,000s\")\n if y is not None and y_hat is not None:\n plt.plot(x, y, 'x', label='Observed Values')\n plt.plot(x, y_hat, '-', label='Model Predictions')\n plt.legend()\n\n\n# ### Extracting Data\n\n\n# function to load files\ndef loader(filename):\n c = np.loadtxt('./data/%s.txt' % filename, delimiter=',')\n x = c[:, :-1] # extract every column except the last column\n y = c[:, -1] # extract the last column\n return x, y\n\n\n# Load x and y values\nx, y = loader('ex1data1')\nx = x.flatten()\n\n\n# ### ! Model Function\n# $$\\hat{y^{(i)}}=w_{0}+w_{1}*x^{(i)}$$\n\n\ndef simple_linear_model(w0, w1, x1):\n #TO-DO#\n # Implement function that outputs y_hat for a mx1 size dataset of x\n\n return y_hat\n\n\n# ### ! Loss Function- Mean Square Error\n#\n# $$MSE = \\frac{1}{2m} \\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})^{2}$$\n\n\ndef calculate_MSE(y, y_hat, size):\n #TO-DO#\n # Calculate the mean squared error based on the difference between model predictions and observed data.\n # HINT: Make use of the function np.sum() to sum the squares of the differences\n\n return loss\n\n\n# ### ! Gradient Descent\n# $$w_{0} = w_{0} - \\alpha \\frac{1}{m}\\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})$$\n# $$w_{1} = w_{1} - \\alpha \\frac{1}{m}\\sum_{i=0}^m(\\hat{y^{(i)}} - {y^{(i)}})x^{(i)}$$\n\n\ndef gradient_descent(w0, w1, x, y, y_hat, size, learning_rate=0.1):\n #TO-DO#\n # Implement function that adjusts the weights, w0 and w1 based on the gradient descent algorithm\n # HINT: Remember that for w1, the difference have to be multiplied by x before summing\n\n return w0, w1\n\n\n# ## 3. 
Train Model\n\n# ### Initialise Training Variables\n\n\nn_epochs = 1500 # number of iterations\nnp.random.seed(0) # seeding to persist results\nm = y.size # Get total number of data samples\n", "project_metadata": {"full_name": "DurhamAI/The-Academy-of-AI", "description": null, "topics": [], "git_url": "git://github.com/DurhamAI/The-Academy-of-AI.git", "stars": 3, "watchers": 3, "forks": 18, "created": "2018-10-21T14:56:23Z", "size": 37303, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1504961, "Python": 26763}, "last_updated": "2020-10-06T19:59:57Z"}, "intent": "# Generate weights"}, {"original_comment": "# Average polution of air by countries\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Global Air Pollution Measurements\n#\n# * [Air Quality Index - Wiki](https://en.wikipedia.org/wiki/Air_quality_index)\n# * [BigQuery - Wiki](https://en.wikipedia.org/wiki/BigQuery)\n#\n# In this notebook data is extracted from *BigQuery Public Data* assesible exclusively only in *Kaggle*. The BigQurey Helper Object will convert data in cloud storage into *Pandas DataFrame* object. The query syntax is same as *SQL*. As size of data is very high convert entire data to DataFrame is cumbersome. So query is written such that will be readly available for Visualization.\n# ***\n# >**Baisc attributes of Air quality index**\n# * Measurement units\n# * $ug/m^3$: micro gram/cubic meter\n# * $ppm$: Parts Per Million\n# * Pollutant\n# * $O3$: Ozone gas\n# * $SO2$: Sulphur Dioxed\n# * $NO2$: Nitrogen Dioxed\n# * $PM 2.5$: Particles with an aerodynamic diameter less than $2.5 \u03bcm$\n# * $PM 10$: Particles with an aerodynamic diameter less than $10 \u03bcm$\n# * $CO$: Carbon monoxide\n#\n# **Steps**\n# 1. Load Packages\n# 2. Bigquery Object\n# 3. AQI range and Statistics\n# 4. Distribution of country listed in AQI\n# 5. Location\n# 6. Air Quality Index value distribution Map veiw\n# 7. Pollutant Statistics\n# 8. Distribution of pollutant and unit\n# 9. Distribution of Source name\n# 10. Sample AQI Averaged over in hours\n# 11. AQI variation with time\n# 12. Country Heatmap\n# 13. Animation\n\n# ### Load packages\n\n#%%\n\n# Load packages\nfrom matplotlib import animation, rc\nfrom IPython.display import HTML, display\nimport base64\nimport io\nimport bq_helper\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom mpl_toolkits.basemap import Basemap\nimport folium\nimport folium.plugins as plugins\n\nimport warnings\nwarnings.filterwarnings('ignore')\npd.options.display.max_rows = 10\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Bigquery\n# BigQuery is a RESTful web service that enables interactive analysis of massively large datasets working in conjunction with Google Storage. 
It is an Infrastructure as a Service that may be used complementarily with MapReduce.\n\n#%%\n\n# Customized query helper function explosively in Kaggle\n\n# Helper object\nopenAQ = bq_helper.BigQueryHelper(active_project='bigquery-public-data',\n dataset_name='openaq')\n# List of table\nopenAQ.list_tables()\n\n#%%\n\n# Schema\nopenAQ.table_schema('global_air_quality')\n\n\n# ### Table display\n\n#%%\n\nopenAQ.head('global_air_quality')\n\n#%%\n\n# Summary statics\nquery = \"\"\"SELECT value,averaged_over_in_hours\n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3'\n \"\"\"\np1 = openAQ.query_to_pandas(query)\np1.describe()\n\n\n# # Air Quality Index Range\n# * [AQI Range](http://aqicn.org/faq/2013-09-09/revised-pm25-aqi-breakpoints/)\n#
\n#\n# The range of AQI is 0 - 500, so lets limit data to that range, in previous kernel's these outlier data points are not removed\n\n#%%\n\nquery = \"\"\"SELECT value,country \n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3' AND value < 0\n \"\"\"\np1 = openAQ.query_to_pandas(query)\np1.describe().T\n\n\n# There are more than 100 value having value less than 0. The lowest value is -999000, which is outlier data point. **Air Quality Meter** is digital a instruments, if meter is show error value then sensor is disconnected or faulty.\n\n#%%\n\nquery2 = \"\"\"SELECT value,country,pollutant\n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3' AND value > 0\n \"\"\"\np2 = openAQ.query_to_pandas(query2)\nprint('0.99 Quantile', p2['value'].quantile(0.99))\np2.describe().T\n\n#%%\n\np2[p2['value'] > 10000]\n\n\n# Country\n# * MK is *Macedonia* [wiki](https://en.wikipedia.org/wiki/Republic_of_Macedonia)\n# * CL is *Chile* [Wiki](https://en.wikipedia.org/wiki/Chile)\n# >In both the countries some may some natural disaster happend so AQI is very high.\n# We will disgrad value more than 10000, which are outlier data point\n\n# ### Distribution of country listed in AQI\n\n#%%\n\nquery = \"\"\"SELECT country,COUNT(country) as `count`\n FROM `bigquery-public-data.openaq.global_air_quality`\n GROUP BY country\n HAVING COUNT(country) >10\n ORDER BY `count`\n \"\"\"\ncnt = openAQ.query_to_pandas_safe(query)\ncnt.tail()\n\nplt.style.use('bmh')\nplt.figure(figsize=(14, 4))\nsns.barplot(cnt['country'], cnt['count'], palette='magma')\nplt.xticks(rotation=45)\nplt.title('Distribution of country listed in data')\n\n\n# ## Location\n# We find find different location where air quality is taken. This location data consist of latitude and logitude, city.\n\n#%%", "target_code": "query = \"\"\"SELECT AVG(value) as `Average`,country\n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3' AND value BETWEEN 0 AND 10000\n GROUP BY country\n ORDER BY Average DESC\n \"\"\"\ncnt = openAQ.query_to_pandas(query)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Global Air Pollution Measurements\n#\n# * [Air Quality Index - Wiki](https://en.wikipedia.org/wiki/Air_quality_index)\n# * [BigQuery - Wiki](https://en.wikipedia.org/wiki/BigQuery)\n#\n# In this notebook data is extracted from *BigQuery Public Data* assesible exclusively only in *Kaggle*. The BigQurey Helper Object will convert data in cloud storage into *Pandas DataFrame* object. The query syntax is same as *SQL*. As size of data is very high convert entire data to DataFrame is cumbersome. So query is written such that will be readly available for Visualization.\n# ***\n# >**Baisc attributes of Air quality index**\n# * Measurement units\n# * $ug/m^3$: micro gram/cubic meter\n# * $ppm$: Parts Per Million\n# * Pollutant\n# * $O3$: Ozone gas\n# * $SO2$: Sulphur Dioxed\n# * $NO2$: Nitrogen Dioxed\n# * $PM 2.5$: Particles with an aerodynamic diameter less than $2.5 \u03bcm$\n# * $PM 10$: Particles with an aerodynamic diameter less than $10 \u03bcm$\n# * $CO$: Carbon monoxide\n#\n# **Steps**\n# 1. Load Packages\n# 2. Bigquery Object\n# 3. AQI range and Statistics\n# 4. Distribution of country listed in AQI\n# 5. Location\n# 6. Air Quality Index value distribution Map veiw\n# 7. Pollutant Statistics\n# 8. Distribution of pollutant and unit\n# 9. Distribution of Source name\n# 10. Sample AQI Averaged over in hours\n# 11. AQI variation with time\n# 12. 
Country Heatmap\n# 13. Animation\n\n# ### Load packages\n\n\n# Load packages\nfrom matplotlib import animation, rc\nfrom IPython.display import HTML, display\nimport base64\nimport io\nimport bq_helper\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom mpl_toolkits.basemap import Basemap\nimport folium\nimport folium.plugins as plugins\n\nimport warnings\nwarnings.filterwarnings('ignore')\npd.options.display.max_rows = 10\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Bigquery\n# BigQuery is a RESTful web service that enables interactive analysis of massively large datasets working in conjunction with Google Storage. It is an Infrastructure as a Service that may be used complementarily with MapReduce.\n\n\n# Customized query helper function explosively in Kaggle\n\n# Helper object\nopenAQ = bq_helper.BigQueryHelper(active_project='bigquery-public-data',\n dataset_name='openaq')\n# List of table\nopenAQ.list_tables()\n\n\n# Schema\nopenAQ.table_schema('global_air_quality')\n\n\n# ### Table display\n\n\nopenAQ.head('global_air_quality')\n\n\n# Summary statics\nquery = \"\"\"SELECT value,averaged_over_in_hours\n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3'\n \"\"\"\np1 = openAQ.query_to_pandas(query)\np1.describe()\n\n\n# # Air Quality Index Range\n# * [AQI Range](http://aqicn.org/faq/2013-09-09/revised-pm25-aqi-breakpoints/)\n#
\n#\n# The range of AQI is 0 - 500, so lets limit data to that range, in previous kernel's these outlier data points are not removed\n\n\nquery = \"\"\"SELECT value,country \n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3' AND value < 0\n \"\"\"\np1 = openAQ.query_to_pandas(query)\np1.describe().T\n\n\n# There are more than 100 value having value less than 0. The lowest value is -999000, which is outlier data point. **Air Quality Meter** is digital a instruments, if meter is show error value then sensor is disconnected or faulty.\n\n\nquery2 = \"\"\"SELECT value,country,pollutant\n FROM `bigquery-public-data.openaq.global_air_quality`\n WHERE unit = '\u00b5g/m\u00b3' AND value > 0\n \"\"\"\np2 = openAQ.query_to_pandas(query2)\nprint('0.99 Quantile', p2['value'].quantile(0.99))\np2.describe().T\n\n\np2[p2['value'] > 10000]\n\n\n# Country\n# * MK is *Macedonia* [wiki](https://en.wikipedia.org/wiki/Republic_of_Macedonia)\n# * CL is *Chile* [Wiki](https://en.wikipedia.org/wiki/Chile)\n# >In both the countries some may some natural disaster happend so AQI is very high.\n# We will disgrad value more than 10000, which are outlier data point\n\n# ### Distribution of country listed in AQI\n\n\nquery = \"\"\"SELECT country,COUNT(country) as `count`\n FROM `bigquery-public-data.openaq.global_air_quality`\n GROUP BY country\n HAVING COUNT(country) >10\n ORDER BY `count`\n \"\"\"\ncnt = openAQ.query_to_pandas_safe(query)\ncnt.tail()\n\nplt.style.use('bmh')\nplt.figure(figsize=(14, 4))\nsns.barplot(cnt['country'], cnt['count'], palette='magma')\nplt.xticks(rotation=45)\nplt.title('Distribution of country listed in data')\n\n\n# ## Location\n# We find find different location where air quality is taken. This location data consist of latitude and logitude, city.\n\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# Average polution of air by countries"}, {"original_comment": "# define simple function with no arguments\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# Date: Dec 7 2020\n#\n#\n# # Python Basics Part 1 ( 25 mins)\n# \n#\n#\n# ### What we will do:\n# 2. Create variables and assign values to them\n# 2. Examine data types - numeric, string, lists\n# 3. Perform simple arithmetic operations\n# 4. Manipulating text aka \u2018string\u2019 data\n# 5. Manipulating lists\n#\n#\n# \n\n# ## 1. Working with Variables\n#\n# A variable is a named (arbitrary) location in memory that stores data. 
That's it\n# - Variables store data of all kinds\n# - Variables are the fundamental way of handling data and doing operations in any programming language\n#\n# ## To do:\n# - assign values to a variable using the assignment operator \"=\"\n# - use print() built-in function to output the contents of a variable\n# - overwrite the contents of a variable\n# - use del delete a variable\n\n#%%\n\n# creating variables\nnum = 500\n\n#%%\n\n\n\n#%%\n\n# print the contents of a variable\nprint(num)\n\n#%%\n\nnum\n\n#%%\n\n# print multiple variables in different line\nnum1 = 100\nprint(num)\nprint(num1)\n\n#%%\n\n# print multiple variables in same line\nprint(num, num1)\n\n#%%\n\n# overwrite a variable\nnum = 1234\n\n#%%\n\nprint(num)\n\n#%%\n\n# delete a variable from memory\ndel num\n\n#%%\n\nprint(num)\n\n\n#\n# ### Rules for Naming variables:\n# - Simple but descriptive names\n# - Case-sensitive spelling\n# - Variables CANNOT contain spaces, use underscores or CamelCase instead\n# - CANNOT begin with a number\n# - Can take almost any name EXCEPT for a python reserved words\n\n#%%\n\n# variables are case-sensitive\nA = 3\n\n#%%\n\nprint(A)\n\n#%%\n\nprint(a)\n\n#%%\n\n# ## 2. Data Types in Python\n#\n# - Python is so popular, in part, because it supports several different kinds of data\n# - Knowing the type of a data is important because it determines the:\n# - range of values it can take on\n# - kind of operations that can be performed on it\n#\n# \n\n# # 3. Arithmetic operations with numeric data types\n#\n# - +, -, *****, /, //, %, ****** \n\n#%%\n\n# example of numeric integer (whole number)\nnum1 = 4\nprint(num1)\n\n#%%\n\n# example of numeric float\nnum2 = 8.2\n\n#%%\n\n# checking the data type of a variable\ntype(num1)\n\n#%%\n\ntype(num2)\n\n\n# #### Convert between data types\n\n#%%\n\n# convert integers <--> floats\na = 4.5\nb = 10\n\n#%%\n\n# convert a to an integer\nint(a)\n\n#%%\n\nprint(a)\n\n#%%\n\n# permanently change a to an integer\na = int(a)\ntype(a)\n\n#%%\n\nprint(a)\n\n#%%\n\n# convert float to an integer\nfloat(b)\n\n#%%\n\n# common arithmetic operations\n28 % 2 # modulo % returns the remainder of a division\n\n#%%\n\n29 % 2\n\n\n#\n\n#%%\n\n\n\n#%%\n\n# # 4. Manipulating 'string' data\n#\n# - Basically, anything enclosed in a single '' or double quotations \" \" is a python string\n# - String is a collection data structures i.e. a collection of one or more characters enclosed in quotaion marks ''\n#\n# ## What we'll do\n# 1. concatenate two or more strings with + \n# 1. find the length of a string using len()\n# 3. count the number of occurences a specific character in a string\n# 5. replace all occurences of c a character in a string\n# 1. convert string of digits to numeric data type\n\n#%%\n\n# create string\ns1 = 'dog'\ns2 = 'hot'\n\n#%%\n\n# find length of a string\nlen(s1)\n\n#%%\n\n# count the number occurences of a character in a string\ns3 = 'Canada'\ns3.count('a')\n\n#%%\n\n# join multiple strings\nnew = s2+s1\nprint(new)\n\n#%%\n\n# insert a hyphen in between the strings\nnew = s2+'-'+s1\nprint(new)\n\n#%%\n\n# convert a string of digits to a numeric data type\nnum_str = '1200'\ntype(num_str)\n\n#%%\n\nnum_str/20\n\n#%%\n\n# convert to an integer\nnum = int(num_str)\n\n#%%\n\nprint(num)\n\n#%%\n\nnum/20\n\n#%%\n\nx = 50\ntype(x)\n\n#%%\n\n# what command to convert x to a string\nx = str(x)\n\n#%%\n\ntype(x)\n\n#%%\n\nprint(x)\n\n\n# # 5. Manipulating lists\n#\n#\n# - List is an ordered collection of items e.g. 
list of numbers, list of strings, list of variables\n# - Items in a list are enclosed by a square braces [ ] \n#\n# ### What we'll do:\n# 1. find the number of items of a list with len() \n# 2. count the occurrences of items of a list .count() \n# 2. add items to list with .append()\n# 5. remove item from a list with .remove() \n# 4. check if a value occurs in a list with in \n\n#%%\n\n# list of values of same data type\nlist1 = [5, 10, 15, 20, 25]\nprint(list1)\n\n#%%\n\n# list of values of different data types\nmix_list = ['cat', 20, 'a', 19.2]\nprint(mix_list)\n\n#%%\n\n# create a list of consecutive numbers between 0 and 100\nlist2 = list(range(0, 100))\nprint(list2)\n\n#%%\n\n# find size of a list\nlen(list2)\n\n#%%\n\n# count occurences of an item in a list\nlist3 = ['eggs', 'ham', 'bacon', 'bread', 'eggs']\nlist3.count('eggs')\n\n#%%\n\nlist3.count('coffee')\n\n#%%\n\n# add an item ot the end of a list\nlist3.append('coffee')\nprint(list3)\n\n#%%\n\n# remove items from a list\nlist3.remove('bacon')\nprint(list3)\n\n#%%\n\n# remove omits the first instance of an item\nlist3.remove('eggs')\nprint(list3)\n\n#%%\n\nlist\n\n\n# ## Questions/Comments? \n# # .\n# # .\n# # .\n# ## Coding Activity (5mins):\n#\n# Instructions: Copy the code below and answer the related questions:\n# ```python\n# s1 = 'the lazy brown fox quickly jumped over the fence\"\n# s2 = 'indomitable'\n# ```\n#\n# 2. How many times does the character 'o' occur in s1?\n# 2. replace the word \"fox\" with \"dog\" in s1\n# 2. what is the result of dividing the length of s1 by s2?\n\n#\n\n# ## Python Basics Part 2 (15mins) -\n# # Indexing and Slicing Lists and Strings\n#\n# - Both the lists and strings are collection data structures\n# - In collection data structures, python keeps track of the unique position , or index of each item\n#\n#\n# ### Indexing\n# Indexing means to refer to a specific item in a collection data structure based on its position (aka index).\n# - Zero-indexing : The first element in a string, list, etc begins at index Zero [0]\n#\n#

\n# \n#

\n#\n\n#%%\n\n# use the string index to access and manipulate a specific element\ns1 = 'breakfast'\n\n#%%\n\n# to index the first element\ns1[0]\n\n#%%\n\n# last element of a string\ns1[-1]\n\n#%%\n\n# last element of a string - longer way\ns1[len(s1)-1]\n\n#%%\n\nlen(s1)\n\n#%%\n\ns1[len(s1)]\n\n#%%\n\n# use indexing to make specific edits\nl = ['cats', 'dogs', 'guinea pigs', 'donkey']\n\n#%%\n\n# replace donkeys with fish\nl[-1] = 'fish'\nprint(l)\n\n#%%\n\nl[2] # third element\n\n#%%\n\n# editing list\nlist2 = [50, 60, 70, 80, 90]\nprint(list2)\n\n#%%\n\n# change 60 to 6000\nlist2[1] = 6000\nprint(list2)\n\n\n# ### Slicing [:]\n# - Slicing - accessing a range of elements from a collection data structure.\n# - uses the range operator \" : \" \n# \n\n#%%\n\n# slice a list to obtain a subset with the first 3 elements of a list\nnum_list = list(range(0, 100))\nprint(num_list)\n\n#%%\n\n# slice a list to obtain a subset with numbers 0 to 15 of a list\nnum_list[0:15]\n\n#%%\n\n# slice a list to obtain a subset with numbers 0 to 15 of a list\nnum_list[0:16]\n\n#%%\n\nnum_list[4:10]\n\n#%%\n\nnum_list[4:10+1]\n\n#%%\n\n# overwriting python's built-in list\n\n#list = ['cat', 'dog']\n\n#%%\n\nnum_list = list(range(0, 100))\n\n\n# # Questions/Comments? \n# # .\n# # .\n# # Break\n#\n\n#\n\n#%%\n\n# # Python Control Structures\n#\n#\n#\n# ## Objectives:\n#\n# 1. Write decision-making code using if-else statements\n# 2. Use for loops to perform iterative actions on strings and lists\n#\n#

\n# In the previous lesson, we covered data structures - numerics, strings, lists - which are the different ways for representing data in python.\n#\n#\n# A control structure is a block of code that determines the order in which commands are executed.\n#\n#\n# \n\n# ## If-else statements: How to write conditional code\n#\n# The IF statement adds a extra layer of control and selectivity to our code.\n# - if the condition is true, then python executes a block of statements called the if-block.\n# - If the statement is false, the interpreter skips the ifblock and processes another block of statements called the else-block.\n#\n# \n#\n#\n# ### what we'll do:\n# 1. Introduce comparison operators: >, <, ==, !=\n# 2. write simple if-statement\n# 2. write if-else statements\n# 3. write compound conditions with logical AND, OR,\n\n#%%\n\nnum = 42\n\n#%%\n\n# simple if statement to determine if a number is odd\nif(num % 2 == 1):\n print('Number is odd')\n\n#%%\n\nnum = 51\nif(num % 2 == 1):\n print('Number is odd')\n\n#%%\n\n# if-else statement to find odd number\nnum = 38\n\nif(num % 2 == 1):\n print('Number is odd')\nelse:\n print('Number is even') # only printed if the condition is False\n\n\n# ## comparison operators\n# - these compare two or more values\n# - These operators return a Boolean : True or False\n\n#%%\n\n# less than\n2 < 10\n\n#%%\n\n# greater than\n10 > 100\n\n#%%\n\n# equal to\n2 == 2\n\n#%%\n\n\"cat\" == \"cat\"\n\n#%%\n\n# not equal to\n\"cat\" != \"dog\"\n\n#%%\n\n# ### Multiple conditions with IF- Logical comparisons\n# ![image.png](attachment:image.png)\n\n#%%\n\n# write a voting eligibility program - AND\n\nage = 18\ncitizen = 'yes'\n\nif(age >= 18) and (citizen == 'yes'):\n print('Eligible to Vote!')\nelse:\n print('Not eligible to vote')\n\n#%%\n\n# write a voting eligibility program - AND\n\nage = 16\ncitizen = 'yes'\n\nif(age >= 18) and (citizen == 'yes'):\n print('Eligible to Vote!')\nelse:\n print('Not eligible to vote')\n\n#%%\n\n# write a credit card eligibility program - OR\n\nemployed = 'yes'\nage = 18\n\nif(employed == 'yes') or (age >= 21):\n print('Eligible!')\nelse:\n print('Not eligible')\n\n#%%\n\nemployed = 'no'\nage = 18\n\nif(employed == 'yes') or (age >= 21):\n print('Eligible!')\nelse:\n print('Not eligible')\n\n#%%\n\n# Boolean\n(2 != 2.0)\n\n#%%\n\n('sky' == 'sky') and ('car' == 'car')\n\n#%%\n\ntype(True)\n\n\n# # Questions/Comments? \n# # .\n# # .\n\n# # For loops\n#\n# - Looping is used to repeatedly execute a command\n# - For loops are used to traverse collection data structures (strings, lists, etc), and executes the same statement at each iteration\n#\n# ### What we'll do:\n# 1. Use for statement to iterate over a string\n# 2. 
use for statement to iterate over a list\n\n#%%\n\n# manually print each character\ns1 = \"Hello\"\nprint(s1[0]) # output the first character\nprint(s1[1])\nprint(s1[2])\nprint(s1[3])\nprint(s1[4])\n\n#%%\n\n# iterativel print each character with for loop\nfor ch in s1:\n print(ch)\n\n#%%\n\n# write a for loop that capitalizes each word in the list\nwords = ['sky', 'blue', 'moon']\n\nfor w in words:\n print(w.upper()) # str.upper() converts to uppercase\n\n#%%\n\nprint(w)\n\n#%%\n\n\n\n#%%\n\n# # Functions\n# - Functions are re-usable code that do something.\n#\n#\n# #### Defining a function\n# \n#\n# ### What we'll do:\n#\n# - define a function\n# - call a function\n# - combine control structures and functions\n\n#%%", "target_code": "def message():\n print('Texas is the best!')\n return\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# Date: Dec 7 2020\n#\n#\n# # Python Basics Part 1 ( 25 mins)\n# \n#\n#\n# ### What we will do:\n# 2. Create variables and assign values to them\n# 2. Examine data types - numeric, string, lists\n# 3. Perform simple arithmetic operations\n# 4. Manipulating text aka \u2018string\u2019 data\n# 5. Manipulating lists\n#\n#\n# \n\n# ## 1. Working with Variables\n#\n# A variable is a named (arbitrary) location in memory that stores data. That's it\n# - Variables store data of all kinds\n# - Variables are the fundamental way of handling data and doing operations in any programming language\n#\n# ## To do:\n# - assign values to a variable using the assignment operator \"=\"\n# - use print() built-in function to output the contents of a variable\n# - overwrite the contents of a variable\n# - use del delete a variable\n\n\n# creating variables\nnum = 500\n\n\n\n\n\n# print the contents of a variable\nprint(num)\n\n\nnum\n\n\n# print multiple variables in different line\nnum1 = 100\nprint(num)\nprint(num1)\n\n\n# print multiple variables in same line\nprint(num, num1)\n\n\n# overwrite a variable\nnum = 1234\n\n\nprint(num)\n\n\n# delete a variable from memory\ndel num\n\n\nprint(num)\n\n\n#\n# ### Rules for Naming variables:\n# - Simple but descriptive names\n# - Case-sensitive spelling\n# - Variables CANNOT contain spaces, use underscores or CamelCase instead\n# - CANNOT begin with a number\n# - Can take almost any name EXCEPT for a python reserved words\n\n\n# variables are case-sensitive\nA = 3\n\n\nprint(A)\n\n\nprint(a)\n\n\n# ## 2. Data Types in Python\n#\n# - Python is so popular, in part, because it supports several different kinds of data\n# - Knowing the type of a data is important because it determines the:\n# - range of values it can take on\n# - kind of operations that can be performed on it\n#\n# \n\n# # 3. Arithmetic operations with numeric data types\n#\n# - +, -, *****, /, //, %, ****** \n\n\n# example of numeric integer (whole number)\nnum1 = 4\nprint(num1)\n\n\n# example of numeric float\nnum2 = 8.2\n\n\n# checking the data type of a variable\ntype(num1)\n\n\ntype(num2)\n\n\n# #### Convert between data types\n\n\n# convert integers <--> floats\na = 4.5\nb = 10\n\n\n# convert a to an integer\nint(a)\n\n\nprint(a)\n\n\n# permanently change a to an integer\na = int(a)\ntype(a)\n\n\nprint(a)\n\n\n# convert float to an integer\nfloat(b)\n\n\n# common arithmetic operations\n28 % 2 # modulo % returns the remainder of a division\n\n\n29 % 2\n\n\n#\n\n\n\n\n\n# # 4. Manipulating 'string' data\n#\n# - Basically, anything enclosed in a single '' or double quotations \" \" is a python string\n# - String is a collection data structures i.e. 
a collection of one or more characters enclosed in quotaion marks ''\n#\n# ## What we'll do\n# 1. concatenate two or more strings with + \n# 1. find the length of a string using len()\n# 3. count the number of occurences a specific character in a string\n# 5. replace all occurences of c a character in a string\n# 1. convert string of digits to numeric data type\n\n\n# create string\ns1 = 'dog'\ns2 = 'hot'\n\n\n# find length of a string\nlen(s1)\n\n\n# count the number occurences of a character in a string\ns3 = 'Canada'\ns3.count('a')\n\n\n# join multiple strings\nnew = s2+s1\nprint(new)\n\n\n# insert a hyphen in between the strings\nnew = s2+'-'+s1\nprint(new)\n\n\n# convert a string of digits to a numeric data type\nnum_str = '1200'\ntype(num_str)\n\n\nnum_str/20\n\n\n# convert to an integer\nnum = int(num_str)\n\n\nprint(num)\n\n\nnum/20\n\n\nx = 50\ntype(x)\n\n\n# what command to convert x to a string\nx = str(x)\n\n\ntype(x)\n\n\nprint(x)\n\n\n# # 5. Manipulating lists\n#\n#\n# - List is an ordered collection of items e.g. list of numbers, list of strings, list of variables\n# - Items in a list are enclosed by a square braces [ ] \n#\n# ### What we'll do:\n# 1. find the number of items of a list with len() \n# 2. count the occurrences of items of a list .count() \n# 2. add items to list with .append()\n# 5. remove item from a list with .remove() \n# 4. check if a value occurs in a list with in \n\n\n# list of values of same data type\nlist1 = [5, 10, 15, 20, 25]\nprint(list1)\n\n\n# list of values of different data types\nmix_list = ['cat', 20, 'a', 19.2]\nprint(mix_list)\n\n\n# create a list of consecutive numbers between 0 and 100\nlist2 = list(range(0, 100))\nprint(list2)\n\n\n# find size of a list\nlen(list2)\n\n\n# count occurences of an item in a list\nlist3 = ['eggs', 'ham', 'bacon', 'bread', 'eggs']\nlist3.count('eggs')\n\n\nlist3.count('coffee')\n\n\n# add an item ot the end of a list\nlist3.append('coffee')\nprint(list3)\n\n\n# remove items from a list\nlist3.remove('bacon')\nprint(list3)\n\n\n# remove omits the first instance of an item\nlist3.remove('eggs')\nprint(list3)\n\n\nlist\n\n\n# ## Questions/Comments? \n# # .\n# # .\n# # .\n# ## Coding Activity (5mins):\n#\n# Instructions: Copy the code below and answer the related questions:\n# ```python\n# s1 = 'the lazy brown fox quickly jumped over the fence\"\n# s2 = 'indomitable'\n# ```\n#\n# 2. How many times does the character 'o' occur in s1?\n# 2. replace the word \"fox\" with \"dog\" in s1\n# 2. what is the result of dividing the length of s1 by s2?\n\n#\n\n# ## Python Basics Part 2 (15mins) -\n# # Indexing and Slicing Lists and Strings\n#\n# - Both the lists and strings are collection data structures\n# - In collection data structures, python keeps track of the unique position , or index of each item\n#\n#\n# ### Indexing\n# Indexing means to refer to a specific item in a collection data structure based on its position (aka index).\n# - Zero-indexing : The first element in a string, list, etc begins at index Zero [0]\n#\n#

\n# \n#

\n#\n\n\n# use the string index to access and manipulate a specific element\ns1 = 'breakfast'\n\n\n# to index the first element\ns1[0]\n\n\n# last element of a string\ns1[-1]\n\n\n# last element of a string - longer way\ns1[len(s1)-1]\n\n\nlen(s1)\n\n\ns1[len(s1)]\n\n\n# use indexing to make specific edits\nl = ['cats', 'dogs', 'guinea pigs', 'donkey']\n\n\n# replace donkeys with fish\nl[-1] = 'fish'\nprint(l)\n\n\nl[2] # third element\n\n\n# editing list\nlist2 = [50, 60, 70, 80, 90]\nprint(list2)\n\n\n# change 60 to 6000\nlist2[1] = 6000\nprint(list2)\n\n\n# ### Slicing [:]\n# - Slicing - accessing a range of elements from a collection data structure.\n# - uses the range operator \" : \" \n# \n\n\n# slice a list to obtain a subset with the first 3 elements of a list\nnum_list = list(range(0, 100))\nprint(num_list)\n\n\n# slice a list to obtain a subset with numbers 0 to 15 of a list\nnum_list[0:15]\n\n\n# slice a list to obtain a subset with numbers 0 to 15 of a list\nnum_list[0:16]\n\n\nnum_list[4:10]\n\n\nnum_list[4:10+1]\n\n\n# overwriting python's built-in list\n\n#list = ['cat', 'dog']\n\n\nnum_list = list(range(0, 100))\n\n\n# # Questions/Comments? \n# # .\n# # .\n# # Break\n#\n\n#\n\n\n# # Python Control Structures\n#\n#\n#\n# ## Objectives:\n#\n# 1. Write decision-making code using if-else statements\n# 2. Use for loops to perform iterative actions on strings and lists\n#\n#

\n# In the previous lesson, we covered data structures - numerics, strings, lists - which are the different ways for representing data in python.\n#\n#\n# A control structure is a block of code that determines the order in which commands are executed.\n#\n#\n# \n\n# ## If-else statements: How to write conditional code\n#\n# The IF statement adds a extra layer of control and selectivity to our code.\n# - if the condition is true, then python executes a block of statements called the if-block.\n# - If the statement is false, the interpreter skips the ifblock and processes another block of statements called the else-block.\n#\n# \n#\n#\n# ### what we'll do:\n# 1. Introduce comparison operators: >, <, ==, !=\n# 2. write simple if-statement\n# 2. write if-else statements\n# 3. write compound conditions with logical AND, OR,\n\n\nnum = 42\n\n\n# simple if statement to determine if a number is odd\nif(num % 2 == 1):\n print('Number is odd')\n\n\nnum = 51\nif(num % 2 == 1):\n print('Number is odd')\n\n\n# if-else statement to find odd number\nnum = 38\n\nif(num % 2 == 1):\n print('Number is odd')\nelse:\n print('Number is even') # only printed if the condition is False\n\n\n# ## comparison operators\n# - these compare two or more values\n# - These operators return a Boolean : True or False\n\n\n# less than\n2 < 10\n\n\n# greater than\n10 > 100\n\n\n# equal to\n2 == 2\n\n\n\"cat\" == \"cat\"\n\n\n# not equal to\n\"cat\" != \"dog\"\n\n\n# ### Multiple conditions with IF- Logical comparisons\n# ![image.png](attachment:image.png)\n\n\n# write a voting eligibility program - AND\n\nage = 18\ncitizen = 'yes'\n\nif(age >= 18) and (citizen == 'yes'):\n print('Eligible to Vote!')\nelse:\n print('Not eligible to vote')\n\n\n# write a voting eligibility program - AND\n\nage = 16\ncitizen = 'yes'\n\nif(age >= 18) and (citizen == 'yes'):\n print('Eligible to Vote!')\nelse:\n print('Not eligible to vote')\n\n\n# write a credit card eligibility program - OR\n\nemployed = 'yes'\nage = 18\n\nif(employed == 'yes') or (age >= 21):\n print('Eligible!')\nelse:\n print('Not eligible')\n\n\nemployed = 'no'\nage = 18\n\nif(employed == 'yes') or (age >= 21):\n print('Eligible!')\nelse:\n print('Not eligible')\n\n\n# Boolean\n(2 != 2.0)\n\n\n('sky' == 'sky') and ('car' == 'car')\n\n\ntype(True)\n\n\n# # Questions/Comments? \n# # .\n# # .\n\n# # For loops\n#\n# - Looping is used to repeatedly execute a command\n# - For loops are used to traverse collection data structures (strings, lists, etc), and executes the same statement at each iteration\n#\n# ### What we'll do:\n# 1. Use for statement to iterate over a string\n# 2. use for statement to iterate over a list\n\n\n# manually print each character\ns1 = \"Hello\"\nprint(s1[0]) # output the first character\nprint(s1[1])\nprint(s1[2])\nprint(s1[3])\nprint(s1[4])\n\n\n# iterativel print each character with for loop\nfor ch in s1:\n print(ch)\n\n\n# write a for loop that capitalizes each word in the list\nwords = ['sky', 'blue', 'moon']\n\nfor w in words:\n print(w.upper()) # str.upper() converts to uppercase\n\n\nprint(w)\n\n\n\n\n\n# # Functions\n# - Functions are re-usable code that do something.\n#\n#\n# #### Defining a function\n# \n#\n# ### What we'll do:\n#\n# - define a function\n# - call a function\n# - combine control structures and functions\n\n\n", "project_metadata": {"full_name": "rhondene/data_science_ML_lessons_biology", "description": "Tutorials on data science/machine learning using Python with a life-science perspective. 
When I started learning programming and machine learning, most of the examples were from other fields like IT or business, so sometimes I found it difficult to apply those skills to biological problems. I help make data science and machine learning more relatable and practical to beginners, especially in my major biology by sharing the skills and knowledge I have acquired in data science and computation. I will do my best to use biological datasets as examples.", "topics": [], "git_url": "git://github.com/rhondene/data_science_ML_lessons_biology.git", "stars": 6, "watchers": 6, "forks": 2, "created": "2018-11-24T05:24:15Z", "size": 3043, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1921673}, "last_updated": "2020-12-27T19:21:54Z"}, "intent": "# define simple function with no arguments"}, {"original_comment": "# ## Top 25 States where applicants apply most\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom matplotlib.pyplot import pie, axis, show\nimport matplotlib as mpl\nfrom decimal import Decimal\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# ### Questions to answers are\n#\n# 1. Top 25 companies which hires most h1b1 visa applicants\n# 2. Top 25 companies according to their Salary\n# 3. Top 25 States where applicants apply most\n# 4. Top 25 Job Titles\n# 5. Top 25 Job Titles according to Salary\n# 6. Top 25 States which Denys most\n# 7. Growth Increase per year in Number of Applications for h1b visa analysis\n# 8. Growth Rate by year ( Certified h1b1 Visa Applicantes )\n# 9. Deny Rate by Year\n# 10. How many applicantes are not Full Time Job and what is average salaray\n# 11. What is Average Salary of top 25 Companies ( according to number of applicantes)\n# 12. What is Average Salary of top 25 Companies who pay high salaries\n\n#%%\n\ndf = pd.read_csv(\"F://python/Advanced python//h1b_kaggle.csv\")\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.shape\n\n#%%\n\ndf.head()\n\n#%%\n\ndrop_columns = [\"Unnamed: 0\", \"FULL_TIME_POSITION\", \"lon\", \"lat\"]\n\nfor col in drop_columns:\n df.drop(col, axis=1, inplace=True)\n\n#%%\n\ndf.drop(\"SOC_NAME\", axis=1, inplace=True)\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.shape\n\n#%%\n\ndf.rename(mapper={\n \"CASE_STATUS\": \"STATUS\",\n \"EMPLOYER_NAME\": \"COMPANY\",\n \"PREVAILING_WAGE\": \"SALARY\",\n \"WORKSITE\": \"STATE\"\n}, axis=1, inplace=True)\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.dropna(inplace=True)\n\n#%%\n\ndf.shape\n\n#%%\n\ndf[:60000].to_csv('my_visa.csv')\n\n#%%\n\ntop_company = df['COMPANY'].value_counts()[:25]\n\n#%%\n\ntop_company\n\n#%%\n\nplt.style.use(\"seaborn-whitegrid\")\nfig, ax = plt.subplots(figsize=(10, 4), facecolor=\"#ABCDEF\")\nsns.barplot(top_company.index, top_company.values, ax=ax)\nplt.title(\"TOP 25 COMPANIES WHICH HIRES MOST H1B VISA APPLICANTS\", fontsize=20)\nplt.xlabel(\"Company name\", fontsize=15)\nplt.ylabel(\"NO. 
OF APPLICANTS\", fontsize=15)\nplt.xticks(rotation=90)\nplt.savefig(\"TOP 25 COMPANIES WHICH HIRES MOST H1B VISA APPLICANTS.jpg\")\nplt.show()\n\n\n# ## Top 25 companies according to their Salary\n\n#%%\n\nTop_salary = df[['COMPANY', 'SALARY']].groupby(\n 'COMPANY').mean().sort_values(by=\"SALARY\", ascending=False)[:25]\nTop_salary\n\n#%%\n\nfig, ax = plt.subplots(figsize=(10, 4), facecolor=\"#123456\")\nsns.barplot(x=Top_salary.index, y=Top_salary['SALARY'], ax=ax)\nplt.title(\"TOP 25 COMPANIES ACCORDING TO HIGHEST SALARY USING H1b VISA\",\n color=\"white\", fontsize=18)\nplt.xlabel(\"Company name\", color=\"white\", fontsize=15)\nplt.ylabel(\"Salary\", color=\"white\", fontsize=15)\nplt.xticks(rotation=90, color=\"white\")\nplt.yticks(color=\"white\")\nplt.savefig(\"TOP 25 COMPANIES ACCORDING TO HIGHEST SALARY USING H1b VISA\")\nplt.show()", "target_code": "Top_states = df[\"STATE\"].value_counts(ascending=False)[:25]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom matplotlib.pyplot import pie, axis, show\nimport matplotlib as mpl\nfrom decimal import Decimal\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Questions to answers are\n#\n# 1. Top 25 companies which hires most h1b1 visa applicants\n# 2. Top 25 companies according to their Salary\n# 3. Top 25 States where applicants apply most\n# 4. Top 25 Job Titles\n# 5. Top 25 Job Titles according to Salary\n# 6. Top 25 States which Denys most\n# 7. Growth Increase per year in Number of Applications for h1b visa analysis\n# 8. Growth Rate by year ( Certified h1b1 Visa Applicantes )\n# 9. Deny Rate by Year\n# 10. How many applicantes are not Full Time Job and what is average salaray\n# 11. What is Average Salary of top 25 Companies ( according to number of applicantes)\n# 12. What is Average Salary of top 25 Companies who pay high salaries\n\n\ndf = pd.read_csv(\"F://python/Advanced python//h1b_kaggle.csv\")\n\n\ndf.columns\n\n\ndf.shape\n\n\ndf.head()\n\n\ndrop_columns = [\"Unnamed: 0\", \"FULL_TIME_POSITION\", \"lon\", \"lat\"]\n\nfor col in drop_columns:\n df.drop(col, axis=1, inplace=True)\n\n\ndf.drop(\"SOC_NAME\", axis=1, inplace=True)\n\n\ndf.head()\n\n\ndf.shape\n\n\ndf.rename(mapper={\n \"CASE_STATUS\": \"STATUS\",\n \"EMPLOYER_NAME\": \"COMPANY\",\n \"PREVAILING_WAGE\": \"SALARY\",\n \"WORKSITE\": \"STATE\"\n}, axis=1, inplace=True)\n\n\ndf.head()\n\n\ndf.dropna(inplace=True)\n\n\ndf.shape\n\n\ndf[:60000].to_csv('my_visa.csv')\n\n\ntop_company = df['COMPANY'].value_counts()[:25]\n\n\ntop_company\n\n\nplt.style.use(\"seaborn-whitegrid\")\nfig, ax = plt.subplots(figsize=(10, 4), facecolor=\"#ABCDEF\")\nsns.barplot(top_company.index, top_company.values, ax=ax)\nplt.title(\"TOP 25 COMPANIES WHICH HIRES MOST H1B VISA APPLICANTS\", fontsize=20)\nplt.xlabel(\"Company name\", fontsize=15)\nplt.ylabel(\"NO. 
OF APPLICANTS\", fontsize=15)\nplt.xticks(rotation=90)\nplt.savefig(\"TOP 25 COMPANIES WHICH HIRES MOST H1B VISA APPLICANTS.jpg\")\nplt.show()\n\n\n# ## Top 25 companies according to their Salary\n\n\nTop_salary = df[['COMPANY', 'SALARY']].groupby(\n 'COMPANY').mean().sort_values(by=\"SALARY\", ascending=False)[:25]\nTop_salary\n\n\nfig, ax = plt.subplots(figsize=(10, 4), facecolor=\"#123456\")\nsns.barplot(x=Top_salary.index, y=Top_salary['SALARY'], ax=ax)\nplt.title(\"TOP 25 COMPANIES ACCORDING TO HIGHEST SALARY USING H1b VISA\",\n color=\"white\", fontsize=18)\nplt.xlabel(\"Company name\", color=\"white\", fontsize=15)\nplt.ylabel(\"Salary\", color=\"white\", fontsize=15)\nplt.xticks(rotation=90, color=\"white\")\nplt.yticks(color=\"white\")\nplt.savefig(\"TOP 25 COMPANIES ACCORDING TO HIGHEST SALARY USING H1b VISA\")\nplt.show()\n\n\n\n", "project_metadata": {"full_name": "ayushkhandelwal123/Data-Science-Project-1", "description": "Analysis on h1b visa dataset", "topics": [], "git_url": "git://github.com/ayushkhandelwal123/Data-Science-Project-1.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2019-07-24T16:12:28Z", "size": 2311, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1328493}, "last_updated": "2020-10-26T16:04:23Z"}, "intent": "# Top 25 States where applicants apply most"}, {"original_comment": "# Flatten torsions involving unique old atoms in old system\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Generate normal htfs\n\n#%%\n\nfrom perses.annihilation.relative import RepartitionedHybridTopologyFactory\nfrom simtk import openmm\nfrom perses.tests.utils import validate_endstate_energies\nfrom perses.annihilation.relative import HybridTopologyFactory\nfrom openmmtools.constants import kB\nfrom perses.rjmc.geometry import FFAllAngleGeometryEngine\nfrom perses.rjmc.topology_proposal import PointMutationEngine\nfrom perses.tests.test_topology_proposal import generate_atp, generate_dipeptide_top_pos_sys\nfrom simtk.openmm import app, unit\nfrom openmmforcefields.generators import SystemGenerator\nimport numpy as np\nimport os\nimport pickle\n\nimport logging\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\n#%%\n\n# Generate htf for capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\nhtf_at = generate_dipeptide_top_pos_sys(atp.topology,\n new_res='THR',\n system=atp.system,\n positions=atp.positions,\n system_generator=sys_gen,\n conduct_htf_prop=True,\n validate_endstate_energy=True)\n\n#%%\n\n# Generate htf for capped THR->ALA in vacuum\npdb = app.PDBFile(\"../input/thr_vacuum.pdb\")\n\nforcefield_files = ['amber14/protein.ff14SB.xml', 'amber14/tip3p.xml']\nbarostat = None\nsystem_generator = SystemGenerator(forcefields=forcefield_files,\n barostat=barostat,\n forcefield_kwargs={'removeCMMotion': False,\n 'ewaldErrorTolerance': 1e-4,\n 'constraints': app.HBonds,\n 'hydrogenMass': 4 * unit.amus},\n periodic_forcefield_kwargs=None,\n small_molecule_forcefield='gaff-2.11',\n nonperiodic_forcefield_kwargs={\n 'nonbondedMethod': app.NoCutoff},\n molecules=None,\n cache=None)\nsystem = system_generator.create_system(pdb.topology)\npositions = unit.quantity.Quantity(value=np.array(\n [list(atom_pos) for atom_pos in pdb.positions.value_in_unit_system(unit.md_unit_system)]), unit=unit.nanometers)\n\nhtf_ta = generate_dipeptide_top_pos_sys(pdb.topology,\n new_res='ALA',\n system=system,\n positions=positions,\n system_generator=system_generator,\n conduct_htf_prop=True,\n 
validate_endstate_energy=True)\n\n\n# # Generate normal htfs with flattened torsions and exceptions: ALA -> THR\n\n# ## ALA -> THR lambda = 0\n\n#%%\n\n# Get topology and system capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\n#%%\n\n# Generate topology proposal\npoint_mutation_engine = PointMutationEngine(wildtype_topology=atp.topology,\n system_generator=sys_gen,\n # denote the chain id allowed to mutate (it's always a string variable)\n chain_id='1',\n max_point_mutants=1,\n # the residue ids allowed to mutate\n residues_allowed_to_mutate=['2'],\n # the residue ids allowed to mutate with the three-letter code allowed to change\n allowed_mutations=[('2', 'THR')],\n aggregate=True) # always allow aggregation\ntopology_proposal = point_mutation_engine.propose(\n current_system=atp.system, current_topology=atp.topology)\n\n#%%\n\nunique_new = topology_proposal.unique_new_atoms\n\n#%%\n\nunique_old = topology_proposal.unique_old_atoms\n\n#%%\n\n# Flatten torsions involving unique new atoms in new system\nperiodic_torsion = topology_proposal.new_system.getForce(2)\nfor i in range(periodic_torsion.getNumTorsions()):\n p1, p2, p3, p4, periodicity, phase, k = periodic_torsion.getTorsionParameters(\n i)\n if p1 in unique_new or p2 in unique_new or p3 in unique_new or p4 in unique_new:\n periodic_torsion.setTorsionParameters(\n i, p1, p2, p3, p4, periodicity, phase, 0.*k)\n\n#%%\n\n# Flatten exceptions involving unique new atoms in new system\nnb_force = topology_proposal.new_system.getForce(3)\nfor i in range(nb_force.getNumExceptions()):\n p1, p2, chargeProd, sigma, epsilon = nb_force.getExceptionParameters(i)\n if p1 in unique_new or p2 in unique_new:\n nb_force.setExceptionParameters(i, p1, p2, 0, sigma, 0)\n\n#%%\n\n# Generate geometry proposal\n\n\ntemperature = 300*unit.kelvin\n# Compute kT and inverse temperature.\nkT = kB * temperature\nbeta = 1.0 / kT\nENERGY_THRESHOLD = 1e-6\n\ngeometry_engine = FFAllAngleGeometryEngine(metadata=None,\n use_sterics=False,\n n_bond_divisions=100,\n n_angle_divisions=180,\n n_torsion_divisions=360,\n verbose=True,\n storage=None,\n bond_softening_constant=1.0,\n angle_softening_constant=1.0,\n neglect_angles=False,\n use_14_nonbondeds=True)\n\nforward_new_positions, logp_proposal = geometry_engine.propose(\n topology_proposal, atp.positions, beta, validate_energy_bookkeeping=True)\nlogp_reverse = geometry_engine.logp_reverse(\n topology_proposal, forward_new_positions, atp.positions, beta, validate_energy_bookkeeping=True)\n\n#%%\n\n# Build new htf and validate_endstate_energies\nforward_htf = HybridTopologyFactory(topology_proposal=topology_proposal,\n current_positions=atp.positions,\n new_positions=forward_new_positions,\n use_dispersion_correction=False,\n functions=None,\n softcore_alpha=None,\n bond_softening_constant=1.0,\n angle_softening_constant=1.0,\n soften_only_new=False,\n neglected_new_angle_terms=[],\n neglected_old_angle_terms=[],\n softcore_LJ_v2=True,\n softcore_electrostatics=True,\n softcore_LJ_v2_alpha=0.85,\n softcore_electrostatics_alpha=0.3,\n softcore_sigma_Q=1.0,\n interpolate_old_and_new_14s=False,\n omitted_terms=None)\n\nif not topology_proposal.unique_new_atoms:\n assert geometry_engine.forward_final_context_reduced_potential == None, f\"There are no unique new atoms but the geometry_engine's final context reduced potential is not None (i.e. 
{self._geometry_engine.forward_final_context_reduced_potential})\"\n assert geometry_engine.forward_atoms_with_positions_reduced_potential == None, f\"There are no unique new atoms but the geometry_engine's forward atoms-with-positions-reduced-potential in not None (i.e. { self._geometry_engine.forward_atoms_with_positions_reduced_potential})\"\n vacuum_added_valence_energy = 0.0\nelse:\n added_valence_energy = geometry_engine.forward_final_context_reduced_potential - \\\n geometry_engine.forward_atoms_with_positions_reduced_potential\n\nif not topology_proposal.unique_old_atoms:\n assert geometry_engine.reverse_final_context_reduced_potential == None, f\"There are no unique old atoms but the geometry_engine's final context reduced potential is not None (i.e. {self._geometry_engine.reverse_final_context_reduced_potential})\"\n assert geometry_engine.reverse_atoms_with_positions_reduced_potential == None, f\"There are no unique old atoms but the geometry_engine's atoms-with-positions-reduced-potential in not None (i.e. { self._geometry_engine.reverse_atoms_with_positions_reduced_potential})\"\n subtracted_valence_energy = 0.0\nelse:\n subtracted_valence_energy = geometry_engine.reverse_final_context_reduced_potential - \\\n geometry_engine.reverse_atoms_with_positions_reduced_potential\n\nzero_state_error, one_state_error = validate_endstate_energies(forward_htf._topology_proposal, forward_htf, added_valence_energy, subtracted_valence_energy, beta=1.0/(\n kB*temperature), ENERGY_THRESHOLD=ENERGY_THRESHOLD, platform=openmm.Platform.getPlatformByName('Reference'))\nprint(f\"zero state error : {zero_state_error}\")\nprint(f\"one state error : {one_state_error}\")\n\n\n# ## ALA -> THR lambda = 1\n\n#%%\n\n# Get topology and system capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\n#%%\n\n# Generate topology proposal\npoint_mutation_engine = PointMutationEngine(wildtype_topology=atp.topology,\n system_generator=sys_gen,\n # denote the chain id allowed to mutate (it's always a string variable)\n chain_id='1',\n max_point_mutants=1,\n # the residue ids allowed to mutate\n residues_allowed_to_mutate=['2'],\n # the residue ids allowed to mutate with the three-letter code allowed to change\n allowed_mutations=[('2', 'THR')],\n aggregate=True) # always allow aggregation\ntopology_proposal = point_mutation_engine.propose(\n current_system=atp.system, current_topology=atp.topology)\n\n#%%\n\nunique_new = topology_proposal.unique_new_atoms\n\n#%%\n\nunique_old = topology_proposal.unique_old_atoms\n\n#%%", "target_code": "periodic_torsion = topology_proposal.old_system.getForce(2)\nfor i in range(periodic_torsion.getNumTorsions()):\n p1, p2, p3, p4, periodicity, phase, k = periodic_torsion.getTorsionParameters(\n i)\n if p1 in unique_old or p2 in unique_old or p3 in unique_old or p4 in unique_old:\n periodic_torsion.setTorsionParameters(\n i, p1, p2, p3, p4, periodicity, phase, 0.*k)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Generate normal htfs\n\n\nfrom perses.annihilation.relative import RepartitionedHybridTopologyFactory\nfrom simtk import openmm\nfrom perses.tests.utils import validate_endstate_energies\nfrom perses.annihilation.relative import HybridTopologyFactory\nfrom openmmtools.constants import kB\nfrom perses.rjmc.geometry import FFAllAngleGeometryEngine\nfrom perses.rjmc.topology_proposal import PointMutationEngine\nfrom perses.tests.test_topology_proposal import generate_atp, generate_dipeptide_top_pos_sys\nfrom simtk.openmm import app, unit\nfrom 
openmmforcefields.generators import SystemGenerator\nimport numpy as np\nimport os\nimport pickle\n\nimport logging\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\n\n# Generate htf for capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\nhtf_at = generate_dipeptide_top_pos_sys(atp.topology,\n new_res='THR',\n system=atp.system,\n positions=atp.positions,\n system_generator=sys_gen,\n conduct_htf_prop=True,\n validate_endstate_energy=True)\n\n\n# Generate htf for capped THR->ALA in vacuum\npdb = app.PDBFile(\"../input/thr_vacuum.pdb\")\n\nforcefield_files = ['amber14/protein.ff14SB.xml', 'amber14/tip3p.xml']\nbarostat = None\nsystem_generator = SystemGenerator(forcefields=forcefield_files,\n barostat=barostat,\n forcefield_kwargs={'removeCMMotion': False,\n 'ewaldErrorTolerance': 1e-4,\n 'constraints': app.HBonds,\n 'hydrogenMass': 4 * unit.amus},\n periodic_forcefield_kwargs=None,\n small_molecule_forcefield='gaff-2.11',\n nonperiodic_forcefield_kwargs={\n 'nonbondedMethod': app.NoCutoff},\n molecules=None,\n cache=None)\nsystem = system_generator.create_system(pdb.topology)\npositions = unit.quantity.Quantity(value=np.array(\n [list(atom_pos) for atom_pos in pdb.positions.value_in_unit_system(unit.md_unit_system)]), unit=unit.nanometers)\n\nhtf_ta = generate_dipeptide_top_pos_sys(pdb.topology,\n new_res='ALA',\n system=system,\n positions=positions,\n system_generator=system_generator,\n conduct_htf_prop=True,\n validate_endstate_energy=True)\n\n\n# # Generate normal htfs with flattened torsions and exceptions: ALA -> THR\n\n# ## ALA -> THR lambda = 0\n\n\n# Get topology and system capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\n\n# Generate topology proposal\npoint_mutation_engine = PointMutationEngine(wildtype_topology=atp.topology,\n system_generator=sys_gen,\n # denote the chain id allowed to mutate (it's always a string variable)\n chain_id='1',\n max_point_mutants=1,\n # the residue ids allowed to mutate\n residues_allowed_to_mutate=['2'],\n # the residue ids allowed to mutate with the three-letter code allowed to change\n allowed_mutations=[('2', 'THR')],\n aggregate=True) # always allow aggregation\ntopology_proposal = point_mutation_engine.propose(\n current_system=atp.system, current_topology=atp.topology)\n\n\nunique_new = topology_proposal.unique_new_atoms\n\n\nunique_old = topology_proposal.unique_old_atoms\n\n\n# Flatten torsions involving unique new atoms in new system\nperiodic_torsion = topology_proposal.new_system.getForce(2)\nfor i in range(periodic_torsion.getNumTorsions()):\n p1, p2, p3, p4, periodicity, phase, k = periodic_torsion.getTorsionParameters(\n i)\n if p1 in unique_new or p2 in unique_new or p3 in unique_new or p4 in unique_new:\n periodic_torsion.setTorsionParameters(\n i, p1, p2, p3, p4, periodicity, phase, 0.*k)\n\n\n# Flatten exceptions involving unique new atoms in new system\nnb_force = topology_proposal.new_system.getForce(3)\nfor i in range(nb_force.getNumExceptions()):\n p1, p2, chargeProd, sigma, epsilon = nb_force.getExceptionParameters(i)\n if p1 in unique_new or p2 in unique_new:\n nb_force.setExceptionParameters(i, p1, p2, 0, sigma, 0)\n\n\n# Generate geometry proposal\n\n\ntemperature = 300*unit.kelvin\n# Compute kT and inverse temperature.\nkT = kB * temperature\nbeta = 1.0 / kT\nENERGY_THRESHOLD = 1e-6\n\ngeometry_engine = FFAllAngleGeometryEngine(metadata=None,\n use_sterics=False,\n n_bond_divisions=100,\n n_angle_divisions=180,\n n_torsion_divisions=360,\n verbose=True,\n storage=None,\n 
bond_softening_constant=1.0,\n angle_softening_constant=1.0,\n neglect_angles=False,\n use_14_nonbondeds=True)\n\nforward_new_positions, logp_proposal = geometry_engine.propose(\n topology_proposal, atp.positions, beta, validate_energy_bookkeeping=True)\nlogp_reverse = geometry_engine.logp_reverse(\n topology_proposal, forward_new_positions, atp.positions, beta, validate_energy_bookkeeping=True)\n\n\n# Build new htf and validate_endstate_energies\nforward_htf = HybridTopologyFactory(topology_proposal=topology_proposal,\n current_positions=atp.positions,\n new_positions=forward_new_positions,\n use_dispersion_correction=False,\n functions=None,\n softcore_alpha=None,\n bond_softening_constant=1.0,\n angle_softening_constant=1.0,\n soften_only_new=False,\n neglected_new_angle_terms=[],\n neglected_old_angle_terms=[],\n softcore_LJ_v2=True,\n softcore_electrostatics=True,\n softcore_LJ_v2_alpha=0.85,\n softcore_electrostatics_alpha=0.3,\n softcore_sigma_Q=1.0,\n interpolate_old_and_new_14s=False,\n omitted_terms=None)\n\nif not topology_proposal.unique_new_atoms:\n assert geometry_engine.forward_final_context_reduced_potential == None, f\"There are no unique new atoms but the geometry_engine's final context reduced potential is not None (i.e. {self._geometry_engine.forward_final_context_reduced_potential})\"\n assert geometry_engine.forward_atoms_with_positions_reduced_potential == None, f\"There are no unique new atoms but the geometry_engine's forward atoms-with-positions-reduced-potential in not None (i.e. { self._geometry_engine.forward_atoms_with_positions_reduced_potential})\"\n vacuum_added_valence_energy = 0.0\nelse:\n added_valence_energy = geometry_engine.forward_final_context_reduced_potential - \\\n geometry_engine.forward_atoms_with_positions_reduced_potential\n\nif not topology_proposal.unique_old_atoms:\n assert geometry_engine.reverse_final_context_reduced_potential == None, f\"There are no unique old atoms but the geometry_engine's final context reduced potential is not None (i.e. {self._geometry_engine.reverse_final_context_reduced_potential})\"\n assert geometry_engine.reverse_atoms_with_positions_reduced_potential == None, f\"There are no unique old atoms but the geometry_engine's atoms-with-positions-reduced-potential in not None (i.e. 
{ self._geometry_engine.reverse_atoms_with_positions_reduced_potential})\"\n subtracted_valence_energy = 0.0\nelse:\n subtracted_valence_energy = geometry_engine.reverse_final_context_reduced_potential - \\\n geometry_engine.reverse_atoms_with_positions_reduced_potential\n\nzero_state_error, one_state_error = validate_endstate_energies(forward_htf._topology_proposal, forward_htf, added_valence_energy, subtracted_valence_energy, beta=1.0/(\n kB*temperature), ENERGY_THRESHOLD=ENERGY_THRESHOLD, platform=openmm.Platform.getPlatformByName('Reference'))\nprint(f\"zero state error : {zero_state_error}\")\nprint(f\"one state error : {one_state_error}\")\n\n\n# ## ALA -> THR lambda = 1\n\n\n# Get topology and system capped ALA->THR in vacuum\natp, sys_gen = generate_atp()\n\n\n# Generate topology proposal\npoint_mutation_engine = PointMutationEngine(wildtype_topology=atp.topology,\n system_generator=sys_gen,\n # denote the chain id allowed to mutate (it's always a string variable)\n chain_id='1',\n max_point_mutants=1,\n # the residue ids allowed to mutate\n residues_allowed_to_mutate=['2'],\n # the residue ids allowed to mutate with the three-letter code allowed to change\n allowed_mutations=[('2', 'THR')],\n aggregate=True) # always allow aggregation\ntopology_proposal = point_mutation_engine.propose(\n current_system=atp.system, current_topology=atp.topology)\n\n\nunique_new = topology_proposal.unique_new_atoms\n\n\nunique_old = topology_proposal.unique_old_atoms\n\n", "project_metadata": {"full_name": "zhang-ivy/perses_protein_mutations", "description": "testing and benchmarking perses protein mutations", "topics": [], "git_url": "git://github.com/zhang-ivy/perses_protein_mutations.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-05-27T19:13:11Z", "size": 26622, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 246931846, "Python": 613879, "Shell": 19603}, "last_updated": "2021-01-08T17:00:48Z"}, "intent": "# Flatten torsions involving unique old atoms in old system"}, {"original_comment": "# ## Number of notebooks per repository\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# [Language Use <](2_Languages.ipynb) | [> Notebook Organization](4_Organization.ipynb)\n\n# # Owners\n# This notebook looks at what types of owners (user, organization, educational, personal, and enterprise) have public notebooks on GitHub.\n#\n# ## Results Summary:\n# - 4.72% of all owners are organizations; they own 7.86% of notebooks.\n# - 25.82% of notebook owners only have one notebook on GitHub. 23.89% of notebook owners have over 10 notebooks on GitHub\n# - 54.13% of repositories have a description.\n# - Users have an average of 10.88 notebooks on GitHub (Median = 4). Organizations have an average of 19.03 notebooks on GitHub (Median = 4).\n# - 23.34% of repos with descriptions are educational. 
These repositories hold 28.564% of notebooks.\n# - 16.27% of individual users have primarily educational repositories while 18.12% of organizational users have primarily educational repositories.\n# - Educational owners tend to host more notebooks (mean 9.67, median 4) compared to non-educational owners (mean 8.61, median 3)\n\n# ----\n\n# # Import Packages & Load Data\n\n#%%\n\nfrom nltk.corpus import stopwords\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport re\nimport numpy as np\n\nimport load_data\nimport datetime\nimport pickle\n\n#%%\n\nnotebooks = load_data.load_notebooks()\nrepos = load_data.load_repos()\nowners = load_data.load_owners()\n\n\n# --------\n\n# # Manipulate Data\n\n# ## Separate types of owners: organizations vs users\n\n#%%\n\nusers = owners[owners.type == 'User']\norgs = owners[owners.type == 'Organization']\n\n#%%\n\nuser_nbs = notebooks[notebooks.owner_id.isin(users.owner_id)]\norg_nbs = notebooks[notebooks.owner_id.isin(orgs.owner_id)]\n\n\n# ## Identify educational owners\n# Define educational owners as owners whos repositories are primarily educational.\n# Define an educational repository as one that uses at least one of the educational words listed below. We are only considering repositories with descriptions as eligible.\n\n# ### Isolate educational repositories\n\n#%%\n\nedu_words = [\n 'teach', 'bootcamp', 'boot camp', 'capstone',\n 'final project', 'thesis', 'demonstrat', 'workshop',\n # space to avoid matching 'classification'\n 'lesson', 'course', 'classroom', 'study', 'class ',\n 'research', 'university', 'curriculum', 'udemy',\n 'udacity', 'coursea', 'mit ', 'exercise',\n 'example', 'instruction', 'tutorial', 'education',\n 'student', 'assignment', 'homework', 'problem set', 'lecture'\n]\n\n\n# Limit to repositories with descriptions.\n\n#%%\n\nrepos_has_desc = repos.copy()[~repos.repo_description.isna()]\nprint(\"{0}% of repositories have a description.\".format(round(\n 100*len(repos_has_desc) / len(repos), 2\n)))\n\n\n# Separate repositories with at least one educational word from those without.\n\n#%%\n\nrepos_has_desc['edu_word_count'] = [\n sum([1 if\n re.search('(?= 1]\nnon_edu_repos = repos_has_desc.copy()[repos_has_desc['edu_word_count'] == 0]\n\nedu_notebooks = notebooks.copy()[notebooks.repo_id.isin(edu_repos.repo_id)]\nnon_edu_notebooks = notebooks.copy(\n)[notebooks.repo_id.isin(non_edu_repos.repo_id)]\n\n\n# ### Visually inspect a sample of educational repos. Did we get them right?\n#\n# Most of these descriptions seem to represent educational repositories, held by students and instructors alike.\n#\n# These ones don't seem to fit: \"How to handle emoji in Python + a quick Python script to count emoji in Tweets as an example. 
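The `edu_word_count` cell quoted just above is garbled in this dump: the comparison operator and most of the `re.search` pattern are missing, so the notebook's exact regex cannot be recovered from this excerpt. The snippet below is only a guess at the intent — a case-insensitive count of keyword hits per description — and the `re.escape` call and the `>= 1` threshold are assumptions, not the original code; it reuses the `repos_has_desc` frame and `edu_words` list defined in the record.

import re

# Rough reconstruction (assumed, not the notebook's verbatim cell): count how many
# of the `edu_words` keywords appear in each repository description.
repos_has_desc['edu_word_count'] = [
    sum(1 for word in edu_words
        if re.search(re.escape(word), str(desc), flags=re.IGNORECASE))
    for desc in repos_has_desc['repo_description']
]

# Split into "educational" (at least one keyword) and the rest, matching the
# edu_repos / non_edu_repos variables used later in the notebook.
edu_repos = repos_has_desc.copy()[repos_has_desc['edu_word_count'] >= 1]
non_edu_repos = repos_has_desc.copy()[repos_has_desc['edu_word_count'] == 0]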
(python 2.7)\", \"This is a webcrawler which gets lists of articles from Korea University Library site.\"\n\n#%%\n\nprint(\n edu_repos['repo_description'].sample(\n 10, random_state=50\n ).values\n)\n\n\n# ### Visually inspect a sample of non-educational repos\n#\n# Some repos are described in other languages, so this simple word-based check did not caputre them.\n# - \"\u548cPython\u76f8\u5173\u7684\u5b66\u4e60\u7b14\u8bb0\uff1a\u673a\u5668\u5b66\u4e60\u3001\u7b97\u6cd5\u3001\u8fdb\u9636\u4e66\u7c4d\u3001\u6587\u6863\uff0c\u535a\u5ba2\u5730\u5740\"\n# - translation: \"Python-related study notes: machine learning, algorithms, advanced books, documentation\"\n#\n# Other repos may be educational, but not for sure.\n# - \"Intro to Biomedical Computational Programming\"\n# - \"AM221 PSet\"\n# - \"Experimental physics modules III, IV & V - Thimo Preis\"\n\n#%%\n\nprint(\n non_edu_repos['repo_description'].sample(\n 10, random_state=50\n ).values\n)\n\n#%%\n\nrepo_edu_status = pd.DataFrame({\n 'repo_id': list(edu_repos.repo_id) + list(non_edu_repos.repo_id),\n 'edu': [True]*len(edu_repos) + [False]*len(non_edu_repos)\n})\n\n#%%\n\nf = open('analysis_data/repo_edu_status.df', 'wb')\npickle.dump(repo_edu_status, f)\nf.close()\n\n\n# ### Most common words within each group of repositories\n\n#%%\n\nstop = set(stopwords.words('english'))\n\n#%%\n\nprint(\"Most common words in descriptions of educational repositories\")\nprint(\"and the proportion of descriptions they appear in\")\nprint(\"(excluding educational words we selected for)\")\n(pd.Series([\n w.lower() for w in ' '.join(edu_repos['repo_description']).split()\n if w.lower() not in stop and sum([w.lower() in e or e in w.lower() for e in edu_words]) == 0 and len(w) > 2\n]).value_counts()/len(edu_repos))[:10]\n\n#%%\n\nprint(\"Most common words in descriptions of non-educational repositories\")\n(pd.Series([\n w.lower() for w in ' '.join(non_edu_repos['repo_description']).split()\n if w.lower() not in stop and len(w) > 2\n]).value_counts()/len(non_edu_repos))[:10]\n\n\n# ### Add repository type to repos dataframe\n\n#%%\n\nedu_repos['type'] = 'Educational'\nnon_edu_repos['type'] = 'Non-Educational'\nrepos = pd.concat([edu_repos, non_edu_repos])\n\n\n# ### Determine if *owners* are educational or not\n\n#%%\n\ntype_counts = repos.groupby(\n ['owner_id', 'type']\n)['repo_id'].count().reset_index().pivot(\n index='owner_id',\n columns='type',\n values='repo_id'\n).reset_index().fillna(0)\n\n#%%\n\ntype_counts['owner_type'] = [\n 'Educational' if r['Educational'] > r['Non-Educational']\n else ('Non-Educational' if r['Educational'] < r['Non-Educational']\n else '')\n for _, r in type_counts.iterrows()\n]\n\n#%%\n\ntype_counts = type_counts.merge(owners[['owner_id', 'type']], on='owner_id')\ntype_counts.head()\n\n\n# ---------\n\n# # Visualizations & Statistics\n\n# ## Owner types\n\n#%%\n\nprop_user = len(user_nbs) / (len(user_nbs) + len(org_nbs))\nprop_org = len(org_nbs) / (len(user_nbs) + len(org_nbs))\nprint(\"{0:,} ({1}%) of owners are organizations.\".format(\n len(orgs), round(100*len(orgs)/(len(orgs) + len(users)), 2),\n))\nprint(\"{0}% of notebooks are owned by organizations, while {1}% are owned by users.\".format(\n round(100*prop_org, 2), round(100*prop_user, 2)\n))\n\n\n# ## Notebooks per Owner\n\n#%%\n\nnbs_per_owner = notebooks.groupby('owner_id')['file'].count().reset_index()\n\n#%%\n\nplt.hist(nbs_per_owner.file, bins=500, color='teal')\nplt.xlim(0, 
1500)\nplt.yscale('log')\nplt.xlabel('Notebooks')\nplt.ylabel('Owners')\nplt.title('Notebooks per Owner')\nplt.show()\n\n#%%\n\nprint(\"{0}% of notebook owners only have one notebook on GitHub.\".format(\n round(100*sum(nbs_per_owner.file == 1)/len(nbs_per_owner), 2)\n))\n\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(nbs_per_owner.file > 10)/len(nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}, Range = {2}-{3}\".format(\n round(nbs_per_owner.file.mean(), 2),\n nbs_per_owner.file.median(),\n nbs_per_owner.file.min(),\n nbs_per_owner.file.max()\n))\n\n\n# ## Notebooks per Owner: users vs organizations\n\n#%%\n\nprint(\"For Users:\")\nuser_nbs_per_owner = user_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of notebook owners only have one notebook on GitHub.\".format(\n round(100*sum(user_nbs_per_owner.file == 1)/len(user_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(user_nbs_per_owner.file > 10)/len(user_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(user_nbs_per_owner.file.mean(), 2),\n user_nbs_per_owner.file.median()\n))\n\ntop_owner = owners[\n owners.owner_id == user_nbs_per_owner[\n user_nbs_per_owner.file == user_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top user: {0} with {1} notebooks.'.format(\n top_owner, user_nbs_per_owner.file.max()))\n\nprint(\"\\nFor Organizations:\")\norg_nbs_per_owner = org_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of notebook owners only have one notebook on GitHub.\".format(\n round(100*sum(org_nbs_per_owner.file == 1)/len(org_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(org_nbs_per_owner.file > 10)/len(org_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(org_nbs_per_owner.file.mean(), 2),\n org_nbs_per_owner.file.median()\n))\n\ntop_org = owners[\n owners.owner_id == org_nbs_per_owner[\n org_nbs_per_owner.file == org_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top organization: {0} with {1} notebooks.'.format(\n top_org, org_nbs_per_owner.file.max()))\n\n#%%\n\nfig = plt.figure(figsize=(10, 5))\n\nplt.subplot(2, 1, 1)\nplt.hist(\n user_nbs_per_owner.file,\n bins=50,\n color='teal'\n)\nplt.title(\"Notebooks per User\")\nplt.ylabel(\"Number of Users\")\nplt.xlim(0, 4000)\nplt.yscale('log')\nplt.ylim(1, 100000)\n\nplt.subplot(2, 1, 2)\nplt.hist(\n org_nbs_per_owner.file,\n bins=400,\n color='teal'\n)\nplt.title(\"Notebooks per Organization\")\nplt.xlabel(\"Number of Notebooks\")\nplt.ylabel(\"Number of Orgs\")\nplt.xlim(0, 4000)\nplt.yscale('log')\nplt.ylim(1, 100000)\n\nplt.tight_layout()\nplt.show()\n\n\n# Both distributions are centered with a median at 4, but the range of number of notebooks is much wider for organizations and the average is much higher at 19.03 as opposed to 10.88.\n\n# ## Educational owners versus non-educational owners\n\n#%%\n\nprint(\"We've determined that {0}% of repos with descriptions (containing {1}% of notebooks in repos with descriptions) are educational.\".format(\n round(100*len(edu_repos)/len(repos_has_desc), 2),\n round(100*len(edu_notebooks) /\n len(notebooks[notebooks.repo_id.isin(repos_has_desc.repo_id)]), 3)\n))\n\n#%%\n\nnum_edu_users = type_counts.owner_type.value_counts()['Educational']\nnum_non_edu_users = 
type_counts.owner_type.value_counts()['Non-Educational']\ntotal_users = type_counts.owner_type.value_counts().sum()\n\nprint(\"{0}% of owners with at least one repo description are definitively educational or non-educational.\".format(\n round(100*(num_edu_users + num_non_edu_users)/total_users, 2)\n))\nprint(\"The other {0}% have equal numbers of educational and non-educational repositories\".format(\n round(100-100*(num_edu_users + num_non_edu_users)/total_users, 2)\n))\nprint(\"\\nOf owners that are not split, {0}% are educational and {1}% are not.\".format(\n round(100*(num_edu_users)/(num_edu_users + num_non_edu_users), 2),\n round(100*(num_non_edu_users)/(num_edu_users + num_non_edu_users), 2)\n\n))\n\nnum_org_users = sum(type_counts.type == 'Organization')\nnum_indiv_users = sum(type_counts.type == 'User')\nprint(\"{0}% of individual users are educational while {1}% of organizational users are educational.\".format(\n round(100*sum(np.logical_and(type_counts.type == 'User',\n type_counts.owner_type == 'Educational'))/num_indiv_users, 2),\n round(100*sum(np.logical_and(type_counts.type == 'Organization',\n type_counts.owner_type == 'Educational'))/num_org_users, 2),\n\n))\n\n\n# ## Notebooks per Owner: educational vs not\n\n#%%\n\nedu_nbs = notebooks[notebooks.repo_id.isin(edu_repos.repo_id)]\nnon_edu_nbs = notebooks[notebooks.repo_id.isin(non_edu_repos.repo_id)]\n\nprint(\"For Educational Owners:\")\nedu_nbs_per_owner = edu_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of educational owners only have one notebook on GitHub.\".format(\n round(100*sum(edu_nbs_per_owner.file == 1)/len(edu_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(edu_nbs_per_owner.file > 10)/len(edu_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(edu_nbs_per_owner.file.mean(), 2),\n edu_nbs_per_owner.file.median()\n))\n\ntop_edu = owners[\n owners.owner_id == edu_nbs_per_owner[\n edu_nbs_per_owner.file == edu_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top educational owner: {0} with {1} notebooks.'.format(\n top_edu, edu_nbs_per_owner.file.max()))\n\nprint(\"\\nFor Non-Educational Owners:\")\nnon_edu_nbs_per_owner = non_edu_nbs.groupby(\n 'owner_id')['file'].count().reset_index()\nprint(\"{0}% of non-educational owners only have one notebook on GitHub.\".format(\n round(100*sum(non_edu_nbs_per_owner.file == 1)/len(non_edu_nbs_per_owner), 2)\n))\nprint(\"{0}% of non-educational owners have over 10 notebooks on GitHub\".format(\n round(100*sum(non_edu_nbs_per_owner.file > 10)/len(non_edu_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(non_edu_nbs_per_owner.file.mean(), 2),\n non_edu_nbs_per_owner.file.median()\n))\n\ntop_non_edu = owners[\n owners.owner_id == non_edu_nbs_per_owner[\n non_edu_nbs_per_owner.file == non_edu_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top non-educational owner: {0} with {1} notebooks.'.format(\n top_non_edu, non_edu_nbs_per_owner.file.max()))", "target_code": "nbs_per_repo = notebooks.groupby('repo_id')['file'].count(\n).reset_index().rename(columns={'file': 'num_nbs'})\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# [Language Use <](2_Languages.ipynb) | [> Notebook Organization](4_Organization.ipynb)\n\n# # Owners\n# This notebook looks at what types of owners (user, organization, educational, personal, and enterprise) have public notebooks on 
GitHub.\n#\n# ## Results Summary:\n# - 4.72% of all owners are organizations; they own 7.86% of notebooks.\n# - 25.82% of notebook owners only have one notebook on GitHub. 23.89% of notebook owners have over 10 notebooks on GitHub\n# - 54.13% of repositories have a description.\n# - Users have an average of 10.88 notebooks on GitHub (Median = 4). Organizations have an average of 19.03 notebooks on GitHub (Median = 4).\n# - 23.34% of repos with descriptions are educational. These repositories hold 28.564% of notebooks.\n# - 16.27% of individual users have primarily educational repositories while 18.12% of organizational users have primarily educational repositories.\n# - Educational owners tend to host more notebooks (mean 9.67, median 4) compared to non-educational owners (mean 8.61, median 3)\n\n# ----\n\n# # Import Packages & Load Data\n\n\nfrom nltk.corpus import stopwords\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport re\nimport numpy as np\n\nimport load_data\nimport datetime\nimport pickle\n\n\nnotebooks = load_data.load_notebooks()\nrepos = load_data.load_repos()\nowners = load_data.load_owners()\n\n\n# --------\n\n# # Manipulate Data\n\n# ## Separate types of owners: organizations vs users\n\n\nusers = owners[owners.type == 'User']\norgs = owners[owners.type == 'Organization']\n\n\nuser_nbs = notebooks[notebooks.owner_id.isin(users.owner_id)]\norg_nbs = notebooks[notebooks.owner_id.isin(orgs.owner_id)]\n\n\n# ## Identify educational owners\n# Define educational owners as owners whos repositories are primarily educational.\n# Define an educational repository as one that uses at least one of the educational words listed below. We are only considering repositories with descriptions as eligible.\n\n# ### Isolate educational repositories\n\n\nedu_words = [\n 'teach', 'bootcamp', 'boot camp', 'capstone',\n 'final project', 'thesis', 'demonstrat', 'workshop',\n # space to avoid matching 'classification'\n 'lesson', 'course', 'classroom', 'study', 'class ',\n 'research', 'university', 'curriculum', 'udemy',\n 'udacity', 'coursea', 'mit ', 'exercise',\n 'example', 'instruction', 'tutorial', 'education',\n 'student', 'assignment', 'homework', 'problem set', 'lecture'\n]\n\n\n# Limit to repositories with descriptions.\n\n\nrepos_has_desc = repos.copy()[~repos.repo_description.isna()]\nprint(\"{0}% of repositories have a description.\".format(round(\n 100*len(repos_has_desc) / len(repos), 2\n)))\n\n\n# Separate repositories with at least one educational word from those without.\n\n\nrepos_has_desc['edu_word_count'] = [\n sum([1 if\n re.search('(?= 1]\nnon_edu_repos = repos_has_desc.copy()[repos_has_desc['edu_word_count'] == 0]\n\nedu_notebooks = notebooks.copy()[notebooks.repo_id.isin(edu_repos.repo_id)]\nnon_edu_notebooks = notebooks.copy(\n)[notebooks.repo_id.isin(non_edu_repos.repo_id)]\n\n\n# ### Visually inspect a sample of educational repos. Did we get them right?\n#\n# Most of these descriptions seem to represent educational repositories, held by students and instructors alike.\n#\n# These ones don't seem to fit: \"How to handle emoji in Python + a quick Python script to count emoji in Tweets as an example. 
(python 2.7)\", \"This is a webcrawler which gets lists of articles from Korea University Library site.\"\n\n\nprint(\n edu_repos['repo_description'].sample(\n 10, random_state=50\n ).values\n)\n\n\n# ### Visually inspect a sample of non-educational repos\n#\n# Some repos are described in other languages, so this simple word-based check did not caputre them.\n# - \"\u548cPython\u76f8\u5173\u7684\u5b66\u4e60\u7b14\u8bb0\uff1a\u673a\u5668\u5b66\u4e60\u3001\u7b97\u6cd5\u3001\u8fdb\u9636\u4e66\u7c4d\u3001\u6587\u6863\uff0c\u535a\u5ba2\u5730\u5740\"\n# - translation: \"Python-related study notes: machine learning, algorithms, advanced books, documentation\"\n#\n# Other repos may be educational, but not for sure.\n# - \"Intro to Biomedical Computational Programming\"\n# - \"AM221 PSet\"\n# - \"Experimental physics modules III, IV & V - Thimo Preis\"\n\n\nprint(\n non_edu_repos['repo_description'].sample(\n 10, random_state=50\n ).values\n)\n\n\nrepo_edu_status = pd.DataFrame({\n 'repo_id': list(edu_repos.repo_id) + list(non_edu_repos.repo_id),\n 'edu': [True]*len(edu_repos) + [False]*len(non_edu_repos)\n})\n\n\nf = open('analysis_data/repo_edu_status.df', 'wb')\npickle.dump(repo_edu_status, f)\nf.close()\n\n\n# ### Most common words within each group of repositories\n\n\nstop = set(stopwords.words('english'))\n\n\nprint(\"Most common words in descriptions of educational repositories\")\nprint(\"and the proportion of descriptions they appear in\")\nprint(\"(excluding educational words we selected for)\")\n(pd.Series([\n w.lower() for w in ' '.join(edu_repos['repo_description']).split()\n if w.lower() not in stop and sum([w.lower() in e or e in w.lower() for e in edu_words]) == 0 and len(w) > 2\n]).value_counts()/len(edu_repos))[:10]\n\n\nprint(\"Most common words in descriptions of non-educational repositories\")\n(pd.Series([\n w.lower() for w in ' '.join(non_edu_repos['repo_description']).split()\n if w.lower() not in stop and len(w) > 2\n]).value_counts()/len(non_edu_repos))[:10]\n\n\n# ### Add repository type to repos dataframe\n\n\nedu_repos['type'] = 'Educational'\nnon_edu_repos['type'] = 'Non-Educational'\nrepos = pd.concat([edu_repos, non_edu_repos])\n\n\n# ### Determine if *owners* are educational or not\n\n\ntype_counts = repos.groupby(\n ['owner_id', 'type']\n)['repo_id'].count().reset_index().pivot(\n index='owner_id',\n columns='type',\n values='repo_id'\n).reset_index().fillna(0)\n\n\ntype_counts['owner_type'] = [\n 'Educational' if r['Educational'] > r['Non-Educational']\n else ('Non-Educational' if r['Educational'] < r['Non-Educational']\n else '')\n for _, r in type_counts.iterrows()\n]\n\n\ntype_counts = type_counts.merge(owners[['owner_id', 'type']], on='owner_id')\ntype_counts.head()\n\n\n# ---------\n\n# # Visualizations & Statistics\n\n# ## Owner types\n\n\nprop_user = len(user_nbs) / (len(user_nbs) + len(org_nbs))\nprop_org = len(org_nbs) / (len(user_nbs) + len(org_nbs))\nprint(\"{0:,} ({1}%) of owners are organizations.\".format(\n len(orgs), round(100*len(orgs)/(len(orgs) + len(users)), 2),\n))\nprint(\"{0}% of notebooks are owned by organizations, while {1}% are owned by users.\".format(\n round(100*prop_org, 2), round(100*prop_user, 2)\n))\n\n\n# ## Notebooks per Owner\n\n\nnbs_per_owner = notebooks.groupby('owner_id')['file'].count().reset_index()\n\n\nplt.hist(nbs_per_owner.file, bins=500, color='teal')\nplt.xlim(0, 1500)\nplt.yscale('log')\nplt.xlabel('Notebooks')\nplt.ylabel('Owners')\nplt.title('Notebooks per Owner')\nplt.show()\n\n\nprint(\"{0}% of notebook owners 
only have one notebook on GitHub.\".format(\n round(100*sum(nbs_per_owner.file == 1)/len(nbs_per_owner), 2)\n))\n\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(nbs_per_owner.file > 10)/len(nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}, Range = {2}-{3}\".format(\n round(nbs_per_owner.file.mean(), 2),\n nbs_per_owner.file.median(),\n nbs_per_owner.file.min(),\n nbs_per_owner.file.max()\n))\n\n\n# ## Notebooks per Owner: users vs organizations\n\n\nprint(\"For Users:\")\nuser_nbs_per_owner = user_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of notebook owners only have one notebook on GitHub.\".format(\n round(100*sum(user_nbs_per_owner.file == 1)/len(user_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(user_nbs_per_owner.file > 10)/len(user_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(user_nbs_per_owner.file.mean(), 2),\n user_nbs_per_owner.file.median()\n))\n\ntop_owner = owners[\n owners.owner_id == user_nbs_per_owner[\n user_nbs_per_owner.file == user_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top user: {0} with {1} notebooks.'.format(\n top_owner, user_nbs_per_owner.file.max()))\n\nprint(\"\\nFor Organizations:\")\norg_nbs_per_owner = org_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of notebook owners only have one notebook on GitHub.\".format(\n round(100*sum(org_nbs_per_owner.file == 1)/len(org_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(org_nbs_per_owner.file > 10)/len(org_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(org_nbs_per_owner.file.mean(), 2),\n org_nbs_per_owner.file.median()\n))\n\ntop_org = owners[\n owners.owner_id == org_nbs_per_owner[\n org_nbs_per_owner.file == org_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top organization: {0} with {1} notebooks.'.format(\n top_org, org_nbs_per_owner.file.max()))\n\n\nfig = plt.figure(figsize=(10, 5))\n\nplt.subplot(2, 1, 1)\nplt.hist(\n user_nbs_per_owner.file,\n bins=50,\n color='teal'\n)\nplt.title(\"Notebooks per User\")\nplt.ylabel(\"Number of Users\")\nplt.xlim(0, 4000)\nplt.yscale('log')\nplt.ylim(1, 100000)\n\nplt.subplot(2, 1, 2)\nplt.hist(\n org_nbs_per_owner.file,\n bins=400,\n color='teal'\n)\nplt.title(\"Notebooks per Organization\")\nplt.xlabel(\"Number of Notebooks\")\nplt.ylabel(\"Number of Orgs\")\nplt.xlim(0, 4000)\nplt.yscale('log')\nplt.ylim(1, 100000)\n\nplt.tight_layout()\nplt.show()\n\n\n# Both distributions are centered with a median at 4, but the range of number of notebooks is much wider for organizations and the average is much higher at 19.03 as opposed to 10.88.\n\n# ## Educational owners versus non-educational owners\n\n\nprint(\"We've determined that {0}% of repos with descriptions (containing {1}% of notebooks in repos with descriptions) are educational.\".format(\n round(100*len(edu_repos)/len(repos_has_desc), 2),\n round(100*len(edu_notebooks) /\n len(notebooks[notebooks.repo_id.isin(repos_has_desc.repo_id)]), 3)\n))\n\n\nnum_edu_users = type_counts.owner_type.value_counts()['Educational']\nnum_non_edu_users = type_counts.owner_type.value_counts()['Non-Educational']\ntotal_users = type_counts.owner_type.value_counts().sum()\n\nprint(\"{0}% of owners with at least one repo description are definitively educational or 
non-educational.\".format(\n round(100*(num_edu_users + num_non_edu_users)/total_users, 2)\n))\nprint(\"The other {0}% have equal numbers of educational and non-educational repositories\".format(\n round(100-100*(num_edu_users + num_non_edu_users)/total_users, 2)\n))\nprint(\"\\nOf owners that are not split, {0}% are educational and {1}% are not.\".format(\n round(100*(num_edu_users)/(num_edu_users + num_non_edu_users), 2),\n round(100*(num_non_edu_users)/(num_edu_users + num_non_edu_users), 2)\n\n))\n\nnum_org_users = sum(type_counts.type == 'Organization')\nnum_indiv_users = sum(type_counts.type == 'User')\nprint(\"{0}% of individual users are educational while {1}% of organizational users are educational.\".format(\n round(100*sum(np.logical_and(type_counts.type == 'User',\n type_counts.owner_type == 'Educational'))/num_indiv_users, 2),\n round(100*sum(np.logical_and(type_counts.type == 'Organization',\n type_counts.owner_type == 'Educational'))/num_org_users, 2),\n\n))\n\n\n# ## Notebooks per Owner: educational vs not\n\n\nedu_nbs = notebooks[notebooks.repo_id.isin(edu_repos.repo_id)]\nnon_edu_nbs = notebooks[notebooks.repo_id.isin(non_edu_repos.repo_id)]\n\nprint(\"For Educational Owners:\")\nedu_nbs_per_owner = edu_nbs.groupby('owner_id')['file'].count().reset_index()\nprint(\"{0}% of educational owners only have one notebook on GitHub.\".format(\n round(100*sum(edu_nbs_per_owner.file == 1)/len(edu_nbs_per_owner), 2)\n))\nprint(\"{0}% of notebook owners have over 10 notebooks on GitHub\".format(\n round(100*sum(edu_nbs_per_owner.file > 10)/len(edu_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(edu_nbs_per_owner.file.mean(), 2),\n edu_nbs_per_owner.file.median()\n))\n\ntop_edu = owners[\n owners.owner_id == edu_nbs_per_owner[\n edu_nbs_per_owner.file == edu_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top educational owner: {0} with {1} notebooks.'.format(\n top_edu, edu_nbs_per_owner.file.max()))\n\nprint(\"\\nFor Non-Educational Owners:\")\nnon_edu_nbs_per_owner = non_edu_nbs.groupby(\n 'owner_id')['file'].count().reset_index()\nprint(\"{0}% of non-educational owners only have one notebook on GitHub.\".format(\n round(100*sum(non_edu_nbs_per_owner.file == 1)/len(non_edu_nbs_per_owner), 2)\n))\nprint(\"{0}% of non-educational owners have over 10 notebooks on GitHub\".format(\n round(100*sum(non_edu_nbs_per_owner.file > 10)/len(non_edu_nbs_per_owner), 2)\n))\nprint(\"Mean = {0}, Median = {1}\".format(\n round(non_edu_nbs_per_owner.file.mean(), 2),\n non_edu_nbs_per_owner.file.median()\n))\n\ntop_non_edu = owners[\n owners.owner_id == non_edu_nbs_per_owner[\n non_edu_nbs_per_owner.file == non_edu_nbs_per_owner.file.max()\n ].owner_id.values[0]\n].owner_html_url.values[0]\nprint('Top non-educational owner: {0} with {1} notebooks.'.format(\n top_non_edu, non_edu_nbs_per_owner.file.max()))\n\n\n\n", "project_metadata": {"full_name": "jupyter-resources/notebook-research", "description": "Research on the usage of Jupyter notebooks", "topics": [], "git_url": "git://github.com/jupyter-resources/notebook-research.git", "stars": 13, "watchers": 13, "forks": 5, "created": "2019-07-17T16:39:45Z", "size": 12004, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2082863, "Python": 180881}, "last_updated": "2020-07-04T11:49:14Z"}, "intent": "# Number of notebooks per repository"}, {"original_comment": "# appyling converter to change sentiments from 3 to 2-classed\n", "original_context": 
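The record that closes just above asks for the number of notebooks per repository, and its target code builds `nbs_per_repo` with a pandas groupby. Purely as an illustrative follow-on (not part of the captured notebook), the same summary-plus-histogram pattern used for the per-owner counts could be applied to that result, assuming the `notebooks` DataFrame with `repo_id` and `file` columns shown in the record:

import matplotlib.pyplot as plt

# Notebooks per repository, mirroring the per-owner analysis in the record above.
nbs_per_repo = notebooks.groupby('repo_id')['file'].count(
).reset_index().rename(columns={'file': 'num_nbs'})

print("Mean = {0}, Median = {1}, Max = {2}".format(
    round(nbs_per_repo.num_nbs.mean(), 2),
    nbs_per_repo.num_nbs.median(),
    nbs_per_repo.num_nbs.max()
))

plt.hist(nbs_per_repo.num_nbs, bins=100, color='teal')
plt.yscale('log')
plt.xlabel('Notebooks')
plt.ylabel('Repositories')
plt.title('Notebooks per Repository')
plt.show()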
"#!/usr/bin/env python\n# coding: utf-8\n\n# # Binary-Classification with ML Models\n\n# ## Aim of This Notebook\n\n# To classify reviews as positive of negative easily helps company to see negative reviewed products easily without reading too much reviews. So, my focus point is negative rated products. So, I divided my target as 0 and 1. 3 and above rated books were collected together as 1 and 1-2 were classified as 0.\n#\n# My aim in this notebook is to predict reviews as positive or negative from text. To do this, I used machine learning algoritms. Also, deep learning solutions can be found in same repo.\n#\n# ### Metric:\n#\n# As metric, I will use balanced accuracy values, but I will also look to confusion matrix to decide because my concern is to predict 0 (negative) reviews more accurate.\n\n# ## Importing Necessary Libraries\n\n#%%\n\n# dataframe and series\nfrom lightgbm import LGBMClassifier\nimport lightgbm as lgb\nfrom xgboost import XGBClassifier\nimport xgboost as xgb\nfrom nltk.corpus import stopwords\nimport nltk\nimport seaborn as sns\nimport matplotlib as mpl\nimport pandas as pd\nimport numpy as np\n\n# sklearn imports for modeling part\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import accuracy_score, balanced_accuracy_score\nfrom sklearn.model_selection import train_test_split\n\nfrom mlxtend.evaluate import confusion_matrix\nfrom mlxtend.plotting import plot_confusion_matrix\nfrom mlxtend.plotting import plot_decision_regions\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import roc_auc_score\n# To plot\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# XGBoost and LGBM classifier imports\n\n#%%\n\npd.options.display.max_columns = 100 # To see the hidden columns in dataframe\n\n#%%\n\n# taking cleaned data from csv\ndf = pd.read_csv('cleaned_data.csv', low_memory=False)\n\n#%%\n\ndf.head()\n\n\n# I have already cleaned my data but after converting my data to new column, I would like to make sure my new column is clean or not.\n\n#%%\n\ndf.isna().sum() # to check cleaned column\n\n#%%\n\n# droping null's in review clean\ndf.dropna(subset=['review_clean'], inplace=True)\n\n#%%\n\ndf.isna().sum() # to check\n\n\n# ## Taking Samples for Modeling\n\n# My target is highly unbalanced. To teach my model more about minority class, I will take sample data from each classes balanced.\n\n#%%\n\ndef calc_two_sentiment(overall):\n '''This function encodes the rating 1 and 2 as 0, others as 1'''\n if overall >= 3:\n return 1\n else:\n return 0\n\n#%%", "target_code": "df['sentiment'] = df['overall'].apply(calc_two_sentiment)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Binary-Classification with ML Models\n\n# ## Aim of This Notebook\n\n# To classify reviews as positive of negative easily helps company to see negative reviewed products easily without reading too much reviews. So, my focus point is negative rated products. So, I divided my target as 0 and 1. 3 and above rated books were collected together as 1 and 1-2 were classified as 0.\n#\n# My aim in this notebook is to predict reviews as positive or negative from text. To do this, I used machine learning algoritms. 
Also, deep learning solutions can be found in same repo.\n#\n# ### Metric:\n#\n# As metric, I will use balanced accuracy values, but I will also look to confusion matrix to decide because my concern is to predict 0 (negative) reviews more accurate.\n\n# ## Importing Necessary Libraries\n\n\n# dataframe and series\nfrom lightgbm import LGBMClassifier\nimport lightgbm as lgb\nfrom xgboost import XGBClassifier\nimport xgboost as xgb\nfrom nltk.corpus import stopwords\nimport nltk\nimport seaborn as sns\nimport matplotlib as mpl\nimport pandas as pd\nimport numpy as np\n\n# sklearn imports for modeling part\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import accuracy_score, balanced_accuracy_score\nfrom sklearn.model_selection import train_test_split\n\nfrom mlxtend.evaluate import confusion_matrix\nfrom mlxtend.plotting import plot_confusion_matrix\nfrom mlxtend.plotting import plot_decision_regions\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import roc_auc_score\n# To plot\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# XGBoost and LGBM classifier imports\n\n\npd.options.display.max_columns = 100 # To see the hidden columns in dataframe\n\n\n# taking cleaned data from csv\ndf = pd.read_csv('cleaned_data.csv', low_memory=False)\n\n\ndf.head()\n\n\n# I have already cleaned my data but after converting my data to new column, I would like to make sure my new column is clean or not.\n\n\ndf.isna().sum() # to check cleaned column\n\n\n# droping null's in review clean\ndf.dropna(subset=['review_clean'], inplace=True)\n\n\ndf.isna().sum() # to check\n\n\n# ## Taking Samples for Modeling\n\n# My target is highly unbalanced. To teach my model more about minority class, I will take sample data from each classes balanced.\n\n\ndef calc_two_sentiment(overall):\n '''This function encodes the rating 1 and 2 as 0, others as 1'''\n if overall >= 3:\n return 1\n else:\n return 0\n\n", "project_metadata": {"full_name": "ezgigm/sentiment_analysis_and_product_recommendation", "description": "From the Kindle Store Reviews on Amazon, sentiment analysis and book recommendation. Used Keras, FastText from Torch, and BERT. For recommender systems; SVDS, cosine-similarity, and solved the cold-start problem.", "topics": [], "git_url": "git://github.com/ezgigm/sentiment_analysis_and_product_recommendation.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-05-20T19:38:27Z", "size": 3636, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3652478}, "last_updated": "2020-12-11T08:27:30Z"}, "intent": "# appyling converter to change sentiments from 3 to 2-classed"}, {"original_comment": "# changing the cast column from json to string\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## TMDB Score Prediction and Movie Recommendation\n#\n# ![](https://midasmedia.co.uk//media/2014/03/quality-score-grades-1-10.png)\n#\n# This notebook will be completely different from my previous notebooks which were completely EDA's(my favorite :p). In this notebook I will try to predict the TMDB score for a movie. 
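Stepping back to the sentiment-classification record that closes just above (before the TMDB record's introduction resumes): it applies `calc_two_sentiment` to build the binary target and says the classes will be sampled in a balanced way, but that sampling step is not shown in the excerpt. A minimal sketch of it, assuming the `df` and `calc_two_sentiment` defined there; the per-class sample size and random seed are placeholders, not the notebook's choices.

# Binary sentiment label: 0 = negative (ratings 1-2), 1 = positive (ratings 3-5).
df['sentiment'] = df['overall'].apply(calc_two_sentiment)

# Balanced sample: draw the same number of rows from each class so the minority
# (negative) class is not drowned out. n_per_class and random_state are placeholders.
n_per_class = int(df['sentiment'].value_counts().min())
balanced_df = (
    df.groupby('sentiment', group_keys=False)
      .apply(lambda g: g.sample(n=n_per_class, random_state=42))
      .reset_index(drop=True)
)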
Initially I was planning to do EDA with this dataset, but one of my friend and fellow Kaggler **[Mhamed Jabri](https://www.kaggle.com/mhajabri/)** recommended me to do and try something different, and I would like to thank him a lot as it helped me learn new things.\n#\n# Coming to the dataset, it has informatio about 5000 movies, split into 2 csv files.\n#\n# - ** *tmdb_5000_movies.csv:* ** Contains information like the **score,title,date_of_release,genres,etc.**\n#\n# - ** *tmdb_5000_credits.csv:* ** Contains information of the **cast and crew** for each movie.\n#\n# The main problem with this dataset is the **.json format**. Many columns in the dataset are in json format, therefore cleaning the dataset was the main challenge. For people who don't know about JSON(JavaScript Object Notation), it is basically a syntax for storing and exchanging data between two computers. It is mainly in a **key:value** format, and is embedded into a string.\n#\n# In this notebook, I will try to predict the score or the rating of a movie. We know that the score of a movie depends on various factors like the genre, or the actor working in the film and mainly the director of the film. Considering all such factors, I will try to build a simple score predictor for this dataset.\n#\n# I hope this notebook helps other fellow kagglers in learning something new. **Do Upvote** is you find this notebook useful.\n\n# ### Import Required Packges\n\n#%%\n\nimport operator\nfrom scipy import spatial\nfrom nltk.corpus import stopwords\nimport nltk\nfrom wordcloud import WordCloud, STOPWORDS\nfrom IPython.display import HTML\nimport codecs\nfrom scipy.misc import imread\nimport io\nimport base64\nimport warnings\nimport json\nimport numpy as np\nimport seaborn as sns\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.style.use('fivethirtyeight')\nwarnings.filterwarnings('ignore')\n\n\n# ### Importing the Data\n\n#%%\n\nmovies = pd.read_csv('../input/tmdb_5000_movies.csv')\nmov = pd.read_csv('../input/tmdb_5000_credits.csv')\n\n#%%\n\nmovies.head(3)\n\n\n# Checking the dataset, we can see that **genres,keywords,production_companies,production_countries,spoken_languages** are in the json format. Similarly in the other csv file, **cast and crew** are in the json format. Now lets convert these columns into a format that can be easily read and interpreted. We will convert them into strings and later convert them into lists for easier interpretation.\n#\n# As stated earlier ,the JSON format is like a **dictionary(key:value)** pair embedded in a string. Now parsing the data is a pain and time consuming. Luckily this dataset doesn't have that complicated structure. A basic similarity between the columns is that they have a **name** key, which contains the values that we need to collect. The easiest way to do so parse through the JSON and check for the name key on each row. Once the name key is found, store the value of it into a list and replace the JSON with the list.\n#\n# But we cannot directly parse this JSON as it has to decoded first. For this we use the **json.loads()** method, which decodes it into a list. We can then parse through this list to find the desired values. 
Lets look at the proper syntax below.\n\n# #### Converting the json into strings\n\n#%%\n\n# changing the genres column from json to string\nmovies['genres'] = movies['genres'].apply(json.loads)\nfor index, i in zip(movies.index, movies['genres']):\n list1 = []\n for j in range(len(i)):\n # the key 'name' contains the name of the genre\n list1.append((i[j]['name']))\n movies.loc[index, 'genres'] = str(list1)\n\n# changing the keywords column from json to string\nmovies['keywords'] = movies['keywords'].apply(json.loads)\nfor index, i in zip(movies.index, movies['keywords']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'keywords'] = str(list1)\n\n# changing the production_companies column from json to string\nmovies['production_companies'] = movies['production_companies'].apply(\n json.loads)\nfor index, i in zip(movies.index, movies['production_companies']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'production_companies'] = str(list1)\n\n# changing the production_countries column from json to string\nmovies['production_countries'] = movies['production_countries'].apply(\n json.loads)\nfor index, i in zip(movies.index, movies['production_countries']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'production_countries'] = str(list1)", "target_code": "mov['cast'] = mov['cast'].apply(json.loads)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## TMDB Score Prediction and Movie Recommendation\n#\n# ![](https://midasmedia.co.uk//media/2014/03/quality-score-grades-1-10.png)\n#\n# This notebook will be completely different from my previous notebooks which were completely EDA's(my favorite :p). In this notebook I will try to predict the TMDB score for a movie. Initially I was planning to do EDA with this dataset, but one of my friend and fellow Kaggler **[Mhamed Jabri](https://www.kaggle.com/mhajabri/)** recommended me to do and try something different, and I would like to thank him a lot as it helped me learn new things.\n#\n# Coming to the dataset, it has informatio about 5000 movies, split into 2 csv files.\n#\n# - ** *tmdb_5000_movies.csv:* ** Contains information like the **score,title,date_of_release,genres,etc.**\n#\n# - ** *tmdb_5000_credits.csv:* ** Contains information of the **cast and crew** for each movie.\n#\n# The main problem with this dataset is the **.json format**. Many columns in the dataset are in json format, therefore cleaning the dataset was the main challenge. For people who don't know about JSON(JavaScript Object Notation), it is basically a syntax for storing and exchanging data between two computers. It is mainly in a **key:value** format, and is embedded into a string.\n#\n# In this notebook, I will try to predict the score or the rating of a movie. We know that the score of a movie depends on various factors like the genre, or the actor working in the film and mainly the director of the film. Considering all such factors, I will try to build a simple score predictor for this dataset.\n#\n# I hope this notebook helps other fellow kagglers in learning something new. 
**Do Upvote** is you find this notebook useful.\n\n# ### Import Required Packges\n\n\nimport operator\nfrom scipy import spatial\nfrom nltk.corpus import stopwords\nimport nltk\nfrom wordcloud import WordCloud, STOPWORDS\nfrom IPython.display import HTML\nimport codecs\nfrom scipy.misc import imread\nimport io\nimport base64\nimport warnings\nimport json\nimport numpy as np\nimport seaborn as sns\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.style.use('fivethirtyeight')\nwarnings.filterwarnings('ignore')\n\n\n# ### Importing the Data\n\n\nmovies = pd.read_csv('../input/tmdb_5000_movies.csv')\nmov = pd.read_csv('../input/tmdb_5000_credits.csv')\n\n\nmovies.head(3)\n\n\n# Checking the dataset, we can see that **genres,keywords,production_companies,production_countries,spoken_languages** are in the json format. Similarly in the other csv file, **cast and crew** are in the json format. Now lets convert these columns into a format that can be easily read and interpreted. We will convert them into strings and later convert them into lists for easier interpretation.\n#\n# As stated earlier ,the JSON format is like a **dictionary(key:value)** pair embedded in a string. Now parsing the data is a pain and time consuming. Luckily this dataset doesn't have that complicated structure. A basic similarity between the columns is that they have a **name** key, which contains the values that we need to collect. The easiest way to do so parse through the JSON and check for the name key on each row. Once the name key is found, store the value of it into a list and replace the JSON with the list.\n#\n# But we cannot directly parse this JSON as it has to decoded first. For this we use the **json.loads()** method, which decodes it into a list. We can then parse through this list to find the desired values. 
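The column conversions in this notebook (genres, keywords, production companies and countries, and the cast column that is the record's target) all repeat the same pattern: `json.loads` the column, then keep each entry's 'name' key. A small helper capturing that pattern is sketched here; it is a refactoring suggestion, not code from the original notebook, the usage lines are hypothetical, and the notebook's own column-by-column cells follow immediately below.

import json

def json_names_to_str(df, column):
    """Decode a JSON-formatted column and keep only the 'name' of each entry."""
    df[column] = df[column].apply(json.loads)
    for index, entries in zip(df.index, df[column]):
        names = [entry['name'] for entry in entries]
        df.loc[index, column] = str(names)

# Hypothetical usage on the movies / mov DataFrames loaded above:
# for col in ['genres', 'keywords', 'production_companies', 'production_countries']:
#     json_names_to_str(movies, col)
# json_names_to_str(mov, 'cast')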
Lets look at the proper syntax below.\n\n# #### Converting the json into strings\n\n\n# changing the genres column from json to string\nmovies['genres'] = movies['genres'].apply(json.loads)\nfor index, i in zip(movies.index, movies['genres']):\n list1 = []\n for j in range(len(i)):\n # the key 'name' contains the name of the genre\n list1.append((i[j]['name']))\n movies.loc[index, 'genres'] = str(list1)\n\n# changing the keywords column from json to string\nmovies['keywords'] = movies['keywords'].apply(json.loads)\nfor index, i in zip(movies.index, movies['keywords']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'keywords'] = str(list1)\n\n# changing the production_companies column from json to string\nmovies['production_companies'] = movies['production_companies'].apply(\n json.loads)\nfor index, i in zip(movies.index, movies['production_companies']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'production_companies'] = str(list1)\n\n# changing the production_countries column from json to string\nmovies['production_countries'] = movies['production_countries'].apply(\n json.loads)\nfor index, i in zip(movies.index, movies['production_countries']):\n list1 = []\n for j in range(len(i)):\n list1.append((i[j]['name']))\n movies.loc[index, 'production_countries'] = str(list1)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# change cast column from json to string"}, {"original_comment": "# Predict the Response for both Train and Test\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Python Programming\n#\n# **Chapter 7 : Basic Data Science with Python**\n#\n# Python is a fun language to learn, and really easy to pick up even if you are new to programming. In fact, quite often, Python is easier to pick up if you do not have any programming experience whatsoever. Python is high level programming language, targeted at students and professionals from diverse backgrounds.\n#\n# In this chapter, we will cover\n# - Essential Libraries\n# - Case Study : Linear Regression\n# - Case Study : Classification\n#\n# **License Declaration** : Following the lead from the inspirations for this material, and the *spirit* of Python education and development, all modules of this work are licensed under the Creative Commons Attribution 3.0 Unported License. 
To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/.\n#\n# ---\n\n# ## Essential Libraries\n#\n# Let us begin by importing the essential Python Libraries.\n# You may install any library using `conda install `.\n# Most of the libraries come by default with the Anaconda platform.\n#\n# > NumPy : Library for Numeric Computations in Python\n# > Pandas : Library for Data Acquisition and Preparation\n# > Matplotlib : Low-level library for Data Visualization\n# > Seaborn : Higher-level library for Data Visualization\n\n#%%\n\n# Import Libraries\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.tree import plot_tree\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nimport numpy as np\nimport pandas as pd\nimport seaborn as sb\nimport matplotlib.pyplot as plt # we only need pyplot\nsb.set() # set the default Seaborn style for graphics\n\n\n# We will also need the most common Python libraries for (basic) Machine Learning.\n# Scikit-Learn (`sklearn`) will be our de-facto Machine Learning library in Python.\n#\n# **Linear Regression**\n# > `LinearRegression` model from `sklearn.linear_model` : Our main model for Regression\n# > `mean_squared_error` metric from `sklearn.metrics` : Performance metric for Regression\n#\n# **Classification Tree**\n# > `DecisionTreeClassifier` model from `sklearn.tree` : Our main model for Classification\n# > `plot_tree` method from `sklearn.tree` : Function to clearly visualize a Classification Tree\n# > `confusion_matrix` metric from `sklearn.metrics` : Performance metric for Classification\n#\n# *Common Functionality*\n# > `train_test_split` method from `sklearn.model_selection` : Random Train-Test splits\n\n#%%\n\n# Import essential models and functions from sklearn\n\n# Linear Regression\n\n# Classification Tree\n\n# Common Functionality\n\n#%%\n\n# ---\n#\n# ## Case Study : Linear Regression\n#\n# We use the **\"Pokemon with stats\"** dataset from Kaggle, curated by *Alberto Barradas* (https://www.kaggle.com/abcsds/pokemon).\n#\n# ### Import the Dataset\n#\n# The dataset is in CSV format; hence we use the `read_csv` function from Pandas.\n# Immediately after importing, take a quick look at the data using the `head` function.\n\n#%%\n\n# Read the CSV Data\npkmndata = pd.read_csv('files/pokemonData.csv')\npkmndata.head()\n\n\n# Check the vital statistics of the dataset using the `type` and `shape` attributes.\n# Check the variables (and their types) in the dataset using the `info()` method.\n\n#%%\n\nprint(\"Data type : \", type(pkmndata))\nprint(\"Data dims : \", pkmndata.shape)\nprint()\npkmndata.info()\n\n#%%\n\n# ### Relationship between Numeric Variables\n#\n# Check the mutual relationship between the numeric variables using Correlation and Jointplots.\n\n#%%\n\n# Extract only the numeric data variables\nnumDF = pd.DataFrame(\n pkmndata[[\"Total\", \"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. 
Def\", \"Speed\"]])\n\n# Correlation Matrix\nprint(numDF.corr())\n\n# Heatmap of the Correlation Matrix\nf, axes = plt.subplots(1, 1, figsize=(18, 12))\nsb.heatmap(numDF.corr(), vmin=-1, vmax=1, annot=True,\n fmt=\".2f\", annot_kws={\"size\": 18}, cmap=\"RdBu\")\n\n#%%\n\n# Draw pairs of variables against one another\nsb.pairplot(data=numDF)\n\n#%%\n\n# ### Uni-Variate Regression\n#\n# We will start by setting up a Uni-Variate Linear Regression problem.\n#\n# > Regression Model : Response = $a$ $\\times$ Predictor + $b$\n#\n# Check the mutual relationship between the variables to start with.\n\n#%%\n\n# Set up the problem with Predictor(s) and Response\npredictor = \"HP\"\nresponse = \"Total\"\n\n# 2D scatterplot of two variables to observe their relationship\nf = plt.figure(figsize=(16, 8))\nsb.scatterplot(x=predictor, y=response, data=pkmndata)\n\n\n# Extract the Response and Predictor variables as two individual Pandas `DataFrame`.\n\n#%%\n\n# Extract Response and Predictors\ny = pd.DataFrame(pkmndata[response])\nX = pd.DataFrame(pkmndata[predictor])\n\n\n# Split the dataset randomly into Train and Test datasets using `train_test_split`.\n\n#%%\n\n# Split the Dataset into Train and Test\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n\n\n# `LinearRegression` is a class for the regression model in `sklearn`.\n# We need to create an object of the `LinearRegression` class, as follows.\n\n#%%\n\n# Create a Linear Regression object\nlinreg = LinearRegression()\n\n\n# Train the Linear Regression model using the Train Set `X_train` and `y_train`.\n\n#%%\n\n# Train the Linear Regression model\nlinreg.fit(X_train, y_train)\n\n\n# You have *trained* the model to fit the following formula.\n#\n# > Regression Problem : Response = $a$ $\\times$ Predictor + $b$\n#\n# Check Intercept ($b$) and Coefficient ($a$) of the regression line.\n\n#%%\n\n# Coefficients of the Linear Regression line\nprint('Intercept \\t b = ', linreg.intercept_)\nprint('Coefficients \\t a = ', linreg.coef_)\n\n#%%\n\n# Predict the response variable using the model you just trained.\n\n#%%\n\n# Predict the Response on the Train Set\ny_train_pred = linreg.predict(X_train)\n\n# Plot the Linear Regression line\nf = plt.figure(figsize=(16, 8))\nplt.scatter(X_train, y_train)\nplt.scatter(X_train, y_train_pred, color=\"red\")\nplt.show()\n\n#%%\n\n# Check the *Goodness of Fit* on the Train and Test Sets.\n# Metrics : Explained Variance and Mean Squared Error.\n\n#%%\n\n# Explained Variance (R^2) on Train Set\nprint(\"Explained Variance (R^2) on Train Set \\t\", linreg.score(X_train, y_train))\n\n# Mean Squared Error (MSE) on Train Set\ny_train_pred = linreg.predict(X_train)\nprint(\"Mean Squared Error (MSE) on Train Set \\t\",\n mean_squared_error(y_train, y_train_pred))\n\n# Mean Squared Error (MSE) on Test Set\ny_test_pred = linreg.predict(X_test)\nprint(\"Mean Squared Error (MSE) on Test Set \\t\",\n mean_squared_error(y_test, y_test_pred))\n\n#%%\n\n# It is quite meaningful to check the Predictions against the True values of the Response variable.\n\n#%%", "target_code": "y_train_pred = linreg.predict(X_train)\ny_test_pred = linreg.predict(X_test)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Python Programming\n#\n# **Chapter 7 : Basic Data Science with Python**\n#\n# Python is a fun language to learn, and really easy to pick up even if you are new to programming. In fact, quite often, Python is easier to pick up if you do not have any programming experience whatsoever. 
Python is high level programming language, targeted at students and professionals from diverse backgrounds.\n#\n# In this chapter, we will cover\n# - Essential Libraries\n# - Case Study : Linear Regression\n# - Case Study : Classification\n#\n# **License Declaration** : Following the lead from the inspirations for this material, and the *spirit* of Python education and development, all modules of this work are licensed under the Creative Commons Attribution 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/.\n#\n# ---\n\n# ## Essential Libraries\n#\n# Let us begin by importing the essential Python Libraries.\n# You may install any library using `conda install `.\n# Most of the libraries come by default with the Anaconda platform.\n#\n# > NumPy : Library for Numeric Computations in Python\n# > Pandas : Library for Data Acquisition and Preparation\n# > Matplotlib : Low-level library for Data Visualization\n# > Seaborn : Higher-level library for Data Visualization\n\n\n# Import Libraries\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.tree import plot_tree\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nimport numpy as np\nimport pandas as pd\nimport seaborn as sb\nimport matplotlib.pyplot as plt # we only need pyplot\nsb.set() # set the default Seaborn style for graphics\n\n\n# We will also need the most common Python libraries for (basic) Machine Learning.\n# Scikit-Learn (`sklearn`) will be our de-facto Machine Learning library in Python.\n#\n# **Linear Regression**\n# > `LinearRegression` model from `sklearn.linear_model` : Our main model for Regression\n# > `mean_squared_error` metric from `sklearn.metrics` : Performance metric for Regression\n#\n# **Classification Tree**\n# > `DecisionTreeClassifier` model from `sklearn.tree` : Our main model for Classification\n# > `plot_tree` method from `sklearn.tree` : Function to clearly visualize a Classification Tree\n# > `confusion_matrix` metric from `sklearn.metrics` : Performance metric for Classification\n#\n# *Common Functionality*\n# > `train_test_split` method from `sklearn.model_selection` : Random Train-Test splits\n\n\n# Import essential models and functions from sklearn\n\n# Linear Regression\n\n# Classification Tree\n\n# Common Functionality\n\n\n# ---\n#\n# ## Case Study : Linear Regression\n#\n# We use the **\"Pokemon with stats\"** dataset from Kaggle, curated by *Alberto Barradas* (https://www.kaggle.com/abcsds/pokemon).\n#\n# ### Import the Dataset\n#\n# The dataset is in CSV format; hence we use the `read_csv` function from Pandas.\n# Immediately after importing, take a quick look at the data using the `head` function.\n\n\n# Read the CSV Data\npkmndata = pd.read_csv('files/pokemonData.csv')\npkmndata.head()\n\n\n# Check the vital statistics of the dataset using the `type` and `shape` attributes.\n# Check the variables (and their types) in the dataset using the `info()` method.\n\n\nprint(\"Data type : \", type(pkmndata))\nprint(\"Data dims : \", pkmndata.shape)\nprint()\npkmndata.info()\n\n\n# ### Relationship between Numeric Variables\n#\n# Check the mutual relationship between the numeric variables using Correlation and Jointplots.\n\n\n# Extract only the numeric data variables\nnumDF = pd.DataFrame(\n pkmndata[[\"Total\", \"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. 
Def\", \"Speed\"]])\n\n# Correlation Matrix\nprint(numDF.corr())\n\n# Heatmap of the Correlation Matrix\nf, axes = plt.subplots(1, 1, figsize=(18, 12))\nsb.heatmap(numDF.corr(), vmin=-1, vmax=1, annot=True,\n fmt=\".2f\", annot_kws={\"size\": 18}, cmap=\"RdBu\")\n\n\n# Draw pairs of variables against one another\nsb.pairplot(data=numDF)\n\n\n# ### Uni-Variate Regression\n#\n# We will start by setting up a Uni-Variate Linear Regression problem.\n#\n# > Regression Model : Response = $a$ $\\times$ Predictor + $b$\n#\n# Check the mutual relationship between the variables to start with.\n\n\n# Set up the problem with Predictor(s) and Response\npredictor = \"HP\"\nresponse = \"Total\"\n\n# 2D scatterplot of two variables to observe their relationship\nf = plt.figure(figsize=(16, 8))\nsb.scatterplot(x=predictor, y=response, data=pkmndata)\n\n\n# Extract the Response and Predictor variables as two individual Pandas `DataFrame`.\n\n\n# Extract Response and Predictors\ny = pd.DataFrame(pkmndata[response])\nX = pd.DataFrame(pkmndata[predictor])\n\n\n# Split the dataset randomly into Train and Test datasets using `train_test_split`.\n\n\n# Split the Dataset into Train and Test\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n\n\n# `LinearRegression` is a class for the regression model in `sklearn`.\n# We need to create an object of the `LinearRegression` class, as follows.\n\n\n# Create a Linear Regression object\nlinreg = LinearRegression()\n\n\n# Train the Linear Regression model using the Train Set `X_train` and `y_train`.\n\n\n# Train the Linear Regression model\nlinreg.fit(X_train, y_train)\n\n\n# You have *trained* the model to fit the following formula.\n#\n# > Regression Problem : Response = $a$ $\\times$ Predictor + $b$\n#\n# Check Intercept ($b$) and Coefficient ($a$) of the regression line.\n\n\n# Coefficients of the Linear Regression line\nprint('Intercept \\t b = ', linreg.intercept_)\nprint('Coefficients \\t a = ', linreg.coef_)\n\n\n# Predict the response variable using the model you just trained.\n\n\n# Predict the Response on the Train Set\ny_train_pred = linreg.predict(X_train)\n\n# Plot the Linear Regression line\nf = plt.figure(figsize=(16, 8))\nplt.scatter(X_train, y_train)\nplt.scatter(X_train, y_train_pred, color=\"red\")\nplt.show()\n\n\n# Check the *Goodness of Fit* on the Train and Test Sets.\n# Metrics : Explained Variance and Mean Squared Error.\n\n\n# Explained Variance (R^2) on Train Set\nprint(\"Explained Variance (R^2) on Train Set \\t\", linreg.score(X_train, y_train))\n\n# Mean Squared Error (MSE) on Train Set\ny_train_pred = linreg.predict(X_train)\nprint(\"Mean Squared Error (MSE) on Train Set \\t\",\n mean_squared_error(y_train, y_train_pred))\n\n# Mean Squared Error (MSE) on Test Set\ny_test_pred = linreg.predict(X_test)\nprint(\"Mean Squared Error (MSE) on Test Set \\t\",\n mean_squared_error(y_test, y_test_pred))\n\n\n# It is quite meaningful to check the Predictions against the True values of the Response variable.\n\n", "project_metadata": {"full_name": "sgsourav/python-programming", "description": "AI6120 : Course on Python Programming for NTU SCSE MSAI Program", "topics": [], "git_url": "git://github.com/sgsourav/python-programming.git", "stars": 18, "watchers": 18, "forks": 13, "created": "2020-01-14T02:43:30Z", "size": 4892, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 185578, "Python": 46012}, "last_updated": "2020-11-04T08:41:19Z"}, "intent": "# Predict the Response for both Train and Test"}, 
{"original_comment": "# find features with high correlation\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Practice plotting\n\n# Here, we'll do some exploratory data analysis (EDA) of the pitchfx data. EDA is a quick and dirty way to determine what is in your dataset. There are many ways to do this, and plotting the data is a key way of looking at the data. Note that this is mainly practice in plotting a bunch of different types of figures.\n#\n# For reference, a description of the *pitchfx* variables can be found here: https://fastballs.wordpress.com/category/pitchfx-glossary/\n#\n# Begin by importing the necessary libraries and specifying a name for the database you want to create.\n\n#%%\n\n# imports\nfrom IPython.display import display\nfrom mpl_toolkits.mplot3d import Axes3D\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport scipy.cluster.hierarchy as hier\nimport scipy.stats as stats\nimport sklearn.cluster as clust\nimport sklearn.decomposition as dd\nimport sklearn.neighbors as nn\nimport sklearn.preprocessing as pp\nimport sqlite3\nimport subprocess\n\n# specify database name\ndbname = \"../../dat/pitchfx2008.db\"\n\n\n# We'll work with a subset of the dataset when performing EDA. Specifically, we will look at the pitches from a single pitcher. Here, it will be CC Sabathia.\n\n#%%\n\n# connect to the sqlite3 database\ndb = sqlite3.connect(dbname)\nhdb = db.cursor()\n\n#%%\n\n# get specific pitcher info\nquery = \"\"\"SELECT DISTINCT pitchfx.* \n FROM pitchfx\n JOIN events ON (pitchfx.game_id=events.game_id\n AND pitchfx.cur_event=events.event_id)\n WHERE events.pitcher_id=(SELECT player_id\n FROM players\n WHERE players.player_first='CC'\n AND players.player_last='Sabathia')\n ORDER BY game_id, pitch_num\"\"\"\ndf = pd.read_sql_query(query, db)\ndf.head()\n\n\n# ### Printed summaries of data\n\n# One of the first things you can do is to get a typed summary of the data set. We can do this in pandas, which has similar functions as in R. The *describe* function in pandas is similar to the *summary* function in R. Note that function only returns quantitative variables.\n\n#%%\n\n# print out summary of pandas dataframe\n# df.describe()\n\n# transpose it to see all the stats\ndf.describe().transpose()\n\n\n# Note that you can also look at the stats for a single feature. Here, we'll look at *spin rate*.\n\n#%%\n\n# show stats for spin rate\ndf['spin_rate'].describe()\n\n\n# Note that you can also store data in a different variable name and perform similar summary commands. Here, we will look at the percentiles (specifically the 0.1 and 0.5 percentile). The 50% percentile should match the median.\n\n#%%\n\n# store spin rate in a different variable\nspin_rate = df[\"spin_rate\"]\nspin_rate.quantile([0.1, 0.5])\n\n\n# ### Boxplots\n\n# Boxplots are common when visualizing quartile information. We'll stay with *spin rate* for our boxplot. Before plotting the data, we'll need to remove the nans.\n#\n# Note that when extracting a single column from a dataframe, the column is now a *series* object. The series should be converted into a numpy array if plotting from matplotlib and not pandas (here we are plotting directly from a pandas object).\n#\n# The red line is the median, the edges of the box are the 25% and 75% quartiles, and the the lines below and above the box correspond to some factor (*whis* in the arguments) multiplied by the interquartile range (Q3-Q1). 
It is meant to help identify outliers.\n\n#%%\n\n# remove nans\nspin_rate = spin_rate.dropna()\n\n# create boxplot\n#plt.boxplot(spin_rate.as_matrix(), whis=1.5)\nspin_rate.plot.box(whis=1.5)\nplt.show()\n\n\n# Now, let's create boxplots of velocity sorted by pitch types.\n\n#%%\n\n# create plot\ndf.boxplot(column=\"start_speed\", by=\"pitch_type\")\nplt.show()\n\n\n# As we can see, Sabathia's fastball has the highest starting velocity, followed by his sinker, change-up, slider, and cutter.\n\n# Next, we'll look at how horizontal movement in inches (pfx_x) varies with pitch type.\n\n#%%\n\n# create plot\ndf.boxplot(column=\"pfx_x\", by=\"pitch_type\")\nplt.show()\n\n\n# As we can see, horizontal movement varies dramatically between sliders and changes-ups. Perhaps surprisingly, data suggests that Sabathia's slider has less horizontal movement than his change-up. It is assumed that, from the hitter's and catcher's perspective, negative values correspond to movement right to left and positive values correspond to movement left to right (based on Sabathia's handedness and the natural break of the slider).\n\n# Now let's look at vertical movement.\n\n#%%\n\n# create plot\ndf.boxplot(column=\"pfx_z\", by=\"pitch_type\")\nplt.show()\n\n\n# We see that many pitches have \"upward\" movement, which doesn't make sense. However, given the way movement is defined, this can make sense. For example, most pitches, with curveballs being the main exception, have some backspin, which causes them not to drop as much as a spinless pitch (https://www.beyondtheboxscore.com/2009/4/17/841366/understanding-pitch-f-x-graphs). Note that again, sliders and change-ups are well separated from each other, but many pitches have positive vertical movement.\n\n# We'll look more closely at how often each of these pitches is thrown in a later section.\n\n# ### Histograms\n\n# Next, we'll create a histogram of the same *spin_rate* data. We'll use *matplotlib* directly here. We'll show the effects of plotting histograms with different bin sizes side by side.\n\n#%%\n\n# convert series data to numpy array before plotting\n# share y-axis to show difference in values\nf, (ax1, ax2) = plt.subplots(1, 2, sharey=True)\nax1.hist(spin_rate.as_matrix())\nax1.set_xlabel(\"Spin rate\")\nax1.set_xticks(np.arange(0, 4000, 1000))\nax2.hist(spin_rate.as_matrix(), bins=20)\nax2.set_xlabel(\"Spin rate\")\nax2.set_xticks(np.arange(0, 4000, 1000))\nplt.show()\n\n\n# We can also use seaborn to creat histograms with rug plots below. The rug plot is a fine-grain overview of the data, as opposed to a histogram, which bins the data. However, with so many data points the rug plot is not too useful. Note that the y-axis is normalized by default, as this is a distribution.\n\n#%%\n\nfig, ax = plt.subplots()\nsns.distplot(spin_rate.as_matrix(), rug=True, hist=True, rug_kws={\"color\": \"g\"},\n kde_kws={\"color\": \"k\", \"lw\": 3})\nax.set_xlabel(\"Spin rate\")\nplt.show()\n\n\n# ### Bar charts\n\n# We will now create a bar chart of all the pitch types thrown by CC Sabathia. 
First, let's get a summary of the pitch type information.\n\n#%%\n\n# describe pitch type information\ndf[\"pitch_type\"].describe()\n\n\n# There are NaNs in the data, so let's clean them out and put them in a new variable name.\n\n#%%\n\npitch_type = df[\"pitch_type\"].dropna()\n\n\n# Next, let's create a bar plot of all the pitch types.\n\n#%%\n\nfig, ax = plt.subplots()\npitch_type.value_counts().plot(ax=ax, kind=\"bar\")\nax.set_xlabel(\"Pitch type\")\nax.set_ylabel(\"Frequency\")\nplt.show()\n\n\n# We can see that Sabathia complements his four-seam fastball with a slider, change-up, and sinker. There are a few pitches classified as cutters, but they are unlikely to actually be cutters given how infrequent they are throughout the season.\n\n# ### Scatter plots\n\n# Here, we'll look at the raw data through scatter plots of various measurements. First, we'll pull up a summary of the pitchfx parameters that are available.\n\n#%%\n\n# get variable names from pitch fx with data types\nprint(*[df.dtypes], sep=\"\\n\")\n\n\n# Most of these variables are going to be useful for classifying pitches. However, not all of them will be useful. Specifically, any variable not directly related to the trajectory of the ball. Therefore, we will chop off the first eleven variables, which have mostly to do with the times and situations at which the pitch was thrown. For a description of each of the features, have a look at: https://fastballs.wordpress.com/category/pitchfx-glossary.\n\n#%%\n\n# create truncated pitchfx data frame\npdf = df.iloc[:, 11:].dropna()\n\n\n# Now we have a smaller data frame with fewer features. Let make a scatter plots of all these variables to see if there are any clear trends. Note that we will drop all rows containing NaNs to make the analysis easier. Typically where there are NaNs, it means that there was no measurment made of the pitch trajectory information. Therefore, we drop the entire row. We will also drop the last column, which is the pitch classification from MLB.\n#\n# Here, we will plot all features but only 100 pitches.\n\n#%%\n\n#pd.scatter_matrix(pdf.iloc[:100, :-1].dropna(axis=0, how='all'), alpha=0.2, figsize=(10, 10))\npd.scatter_matrix(pdf.iloc[:1000, :-1].dropna(axis=0,\n how='all'), alpha=0.2, figsize=(10, 10))\nplt.show()\n\n\n# There is a lot of information in this scatter matrix, and it's difficult to pick out certain trends. However, we will try to explain some of what we see.\n#\n# 1) yo and break_y seem to be variables with a small spread of values. The former is the distance in feet from home plate where the PITCHf/x system is set to measure the initial parameters, and it is typically fixed after the year 2007. The latter is the distance in feet from home plate to the point in the pitch trajectory where the pitch achieved its greatest deviation from the straight line path between the release point and the front of home plate. This trait is potentially a fixed value for each game/stadium and does not appear to be very insightful.\n#\n# 2) There are very strong linear trends amongst certain features. Among them are x and px and y and pz. All these features are related to the location of the ball crossing the plate, with x and y tied to the old location system and px and pz related to the pitchfx system. Additionally, there seem to be strong trends between ax and pfx_x and az and pfx_z. This suggets that movement (in inches) is tied to acceleration (in ft/s), which is not surprising. 
Also, there is a strong correlation between start_speed and vy0, which is not surprising given that they are measuring speeds toward the plate.\n#\n# 3) There appears to be certain features that produce distinct clusters. For instance, start_speed and spin_rate, start_speed and pfx_x (horizontal movement), and start_speed and break_angle. These might be useful features when attempting to classify pitch types.\n\n# ### Cross plots\n\n# Let's focus on some of the more interesting plots. From point number 3, it seems that useful parameters to look at are end_speed, spin_rate, and pfx_x (horizontal movement). Let's put all of these into one crossplot, with end speed (mph) and spin rate (rpm) as the axes and horizontal movement as the color.\n\n#%%\n\n# crossplot of start speed and spin rate, colored by horizontal movement\nplt.scatter(pdf[\"start_speed\"], pdf['spin_rate'],\n c=pdf['pfx_x'], cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"spin rate (rpm)\")\nplt.colorbar(label=\"horizontal movement (in)\")\nplt.show()\n\n\n# There are three clear clusters!\n#\n# The slowest pitch has a low spin rate and significant movement from right to left (from a left-handed pitcher), which suggests that this pitch is a slider. The second slowest cluster has a higher spin rate and velocity than the slider, and has movement from left to right, which suggests this pitch is his change-up. The third cluster appears to contain two types of pitches, based on the difference in horizontal movement. From looking at the previous bar charts of horizontal movement grouped by pitch type, the sinker has more horizontal movement than the four-seam fastball. Therefore, it looks like we could separate pitches in the third cluster with another axis.\n\n# Now let's compare our results to the given pitch classifications. We'll plot velocity against spin rate, and color by the pitch type.\n\n#%%\n\n# crossplot of spin rate and start speed, colored by pitch type\nsns.pairplot(x_vars=\"start_speed\", y_vars=\"spin_rate\",\n data=pdf, hue=\"pitch_type\", size=5)\nplt.show()\n\n\n# We can see that two of our classifications (slider and changeup) are separated and clustered as we had hypothesized. We can also see that the sinker and four-seam fastball are overlain on the third cluster. However, the spread in spin rate of the sinker appears greater than hypothesized in the previous crossplot.\n\n# Let's get another view of the clusters. Here, we'll switch two of the axes: spin rate and horizontal movement. We will now color by spin rate.\n\n#%%\n\n# crossplot of start speed and spin rate, colored by horizontal movement\nplt.scatter(pdf[\"start_speed\"], pdf['pfx_x'],\n c=pdf['spin_rate'], cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"horizontal movement (in)\")\nplt.colorbar(label=\"spin rate (rpm)\")\nplt.show()\n\n\n# It appears that the higher the spin rate, the greater the horizontal movement, which makes sense. However, it doesn't appear to separate sinkers and fastballs that clearly. We know that sinkers typically have more movement, so it's likely that the higher spin rate pitches are sinkers.\n\n# Let's plot the same scatter plot but color by the classifications from the dataset. 
We'll see how our hypothesis that sinkers have more spin compares to the given labels.\n\n#%%\n\n# crossplot of start speed and horizontal movement, colored by spin rate\nsns.pairplot(x_vars=\"start_speed\", y_vars=\"pfx_x\",\n data=pdf, hue=\"pitch_type\", size=5)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"horizontal movement (in)\")\nplt.show()\n\n\n# As we can see, the sinker has more horizontal movement compared to the four-seam fastball. Therefore, it seems like it's possible to separate the pitches given the features we have. Next, we'll perform some dimension reduction to see if we can separate these events better and to better-understand which features are important.\n\n# ### Dimension reduction\n\n# Dimension reduction is a way to transform the full data set into a lower-dimension representation. We'll look at principal component analysis (PCA) here, as it is a pretty standard method.\n\n# PCA performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. We can potentially separate these pitches in a different space, as well as infer which features are important for data separation.\n\n# As first step, we need to standardize the features. This is an important step here, as the features are very different units and have very different scales. We'll remove the means and divide by the variance of each feature. Remember to remove the categorical features here, as they won't be useful in this case. Additionally, we will remove numerical features that are just indices (e.g., pitch number and event number).\n\n# Note that an alternative to use *OneHotEncoder* to encode the categorical variables instead of dropping them (see: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder).\n\n# A note about sklearn. The pipeline is usually that you initialize your sklearn objects (e.g., standardizing variables, svm, pca, etc.). Next, you can run something like *fit*, *transform*, or *fit_transform*. *fit* means that the output is some sort of value(s). *transform* means that the output has the same dimensions as the input. Sometimes you run *fit* then *transform*, in which case you can use *fit_transform* if you don't care about saving the intermediate values.\n#\n# In the case of standardizing variables, the *fit* portion is when you calculate the mean and standard deviations of the features. The *transform* portion is the actual standardization of the features.\n\n#%%\n\n# remove categorical variables\npfx_vals = df.drop(df.dtypes.index[df.dtypes.values == \"object\"], 1).dropna()\n\n# remove indexing variables\npfx_vals = pfx_vals.drop([\"game_id\",\n \"pitch_num\",\n \"at_bat\",\n \"time\",\n \"cur_event\"], axis=1)\n\n# this is how to standardize the variables in two steps\n# standardize the features\n#stand = pp.StandardScaler()\n#pfx_std = stand.fit(pfx_vals)\n#\n# now standardize the features\n#pfx_vals_std = pfx_std.transform(pfx_vals)\n\n# this is how to standardize the variables in one step\nstand = pp.StandardScaler()\npfx_vals_std = stand.fit_transform(pfx_vals)\n\n\n# Now let's run some PCA now that the features are standardized. We will only use *fit* here, as we're interested in looking at the principal components and the explained variance ratios. 
Note that we are also turning *whiten* on, so components_ vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. Whitening will remove some information from the transformed signal (the relative variance scales of the components) but can sometime improve the predictive accuracy of the downstream estimators by making their data respect some hard-wired assumptions.\n\n#%%\n\n# create pca\npca = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca = pca.fit(pfx_vals_std)\n\n\n# Let's plot the percent of variance explained by each principal component and the cumulative sum of the variance explained on the same plot. This type of plot can help determine how many principal components to keep for data compression or how much variance can be explained when visualizing the data.\n\n#%%\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 34])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n\n# There are a few observations we can make. First, note that we've added a black horizontal dashed line at 90% of the data variance explained. We can see that we can explain 90% of variance in the data with just 8 principal components (as opposed to the full 33 components). Therefore, we can greatly compress the size of our data set if we are willing to be unable to explain 10% of the variance. Second, there is a clear \"elbow\" after the first principal component in our portion of variance explained plot. In other words, while the first principal component only explains roughly 40% of the variance in the data, further principal components do not explain a substantial amount of variance in the data. Therefore, nearly half the variance in the data is explained by the first two principal components, which is useful to know for visualizing our data in 2D.\n\n# So, let's plot our data's PCA scores over the first two principal components, colored by the third principal component.\n\n#%%\n\n# transform the data into pca space\npfx_pca_trans = pfx_pca.transform(pfx_vals_std)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans[:, 0], pfx_pca_trans[:, 1], c=pfx_pca_trans[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# As we can see, there is a clear separation along the first principal component into two clusters. This trend of two clusters explains why the first principal component already explains 40% of the variance in the data. The second principal component shows relatively less variation along its axis, and the third (color) less than that. It appears that there is not much clear information about different types of pitches here (thought a third cluster might be interpretable in the figure).\n\n# Note that we can get the PCA transforms ourselves without having to use the *transform* feature. 
We can get the same result by multiplying the original scaled features by the components matrix. For instance, we can get the first principal component projects using the code below.\n\n#%%\n\n# first principal component\nloadings1 = pfx_pca.components_[0, :]\nprint(np.dot(pfx_vals_std, loadings1))\nprint(pfx_pca_trans[:, 0])\n\n\n# Let's again plot the first two principal components, but color by the labeled pitches from the database.\n\n#%%\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame(\n {\"pca1\": pfx_pca_trans[:, 0], \"pca2\": pfx_pca_trans[:, 1], \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Here, we see that the sliders are the easiest pitch to separate, as they are clearly in their own cluster. However, we see that the rest of the three pitches are grouped into one cluster (changeup, sinker, fastball). These three pitches are spread simiarly along the second principal component axis. Therefore, PCA doesn't seem to be helping to separate pitches much here.\n\n# Let's take a closer look at which features the principal components are extracting at the \"most important\" in terms of variance. To find the most important variables in terms of their contributions to the principal component, we will look at loadings. Loadings are the projections of the principal components onto your variables. A particularly high (or particularly low) loading for a specific variable means that principal component is intimately related to the variable\n#\n# We'll do this by looking for the largest values in the first principal component (first row of the component matrix here), and then the second principal component. Note that the components matrix has the shape (n_components, n_features). A biplot would be useful here... Instead, we use plot bar charts of the loadings and inspect important features manually.\n\n#%%\n\n# first principal component\nloadings1 = pfx_pca.components_[0, :]\n\n# intialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals.shape[1]), list(pfx_vals), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca.components_[1, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals.shape[1]), np.abs(\n loadings2), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals.shape[1]), list(pfx_vals), rotation=90)\nplt.show()\n\n\n# Let's focus on the first principal component first (left). We see that many of the features that relate to the ball trajectory (e.g., velocity, break length/angle, acceleration, movement) appear to be the most important features for the first principal component. From our previous analysis, we know that we can obtain relatively clear clusters plotting only these parameters.\n#\n# Looking at the second principal component (right), it seems that the most important features are related to mostly the location of the pitches. While this is useful information, it is unlike to be much use in classifying pitches.\n#\n# We will do some further \"cleaning\" of the data to tailor to the goal of pitch clustering. 
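# Before doing that cleaning, the biplot mentioned above can be sketched roughly as
# follows. This is not from the original notebook; it assumes pfx_pca_trans, pfx_pca
# and pfx_vals are still in scope, and the arrow scale factor is an arbitrary choice.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pfx_pca_trans[:, 0], pfx_pca_trans[:, 1], color="gray", alpha=0.1)

scale = 5  # arbitrary factor so the loading arrows are visible against the scores
for i, name in enumerate(list(pfx_vals)):
    dx = scale * pfx_pca.components_[0, i]
    dy = scale * pfx_pca.components_[1, i]
    ax.arrow(0, 0, dx, dy, color="red", head_width=0.1)
    ax.text(1.1 * dx, 1.1 * dy, name, fontsize=8)

ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.show()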
Specifically, we will remove all features that do not contribute to ball trajectory (i.e., pitch locations) and see what we get.\n\n#%%\n\n# remove unimportant features\npfx_vals_clean = pfx_vals.drop([\"pre_balls\",\n \"post_balls\",\n \"pre_strike\",\n \"post_strike\",\n \"sz_top\",\n \"sz_bot\",\n \"x\",\n \"y\",\n \"px\",\n \"pz\",\n \"x0\",\n \"y0\",\n \"z0\",\n \"break_y\"], axis=1)\n\n# standardize data\npfx_vals_std_clean = stand.fit_transform(pfx_vals_clean)\n\n\n# Now let's look at PCA again with the new cleaned data.\n\n#%%\n\n# create pca\npca_clean = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca_clean = pca_clean.fit(pfx_vals_std_clean)\n\n\n# As before, let's plot the percent of variance explained by each principal component and the cumulative sum of the variance explained on the same plot.\n\n#%%\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca_clean.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca_clean.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 25])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n\n# We can see that the first principal component already explains 70% of the variance in the dataset.\n\n# Let's plot the PCA scores again for the first three principal components.\n\n#%%\n\n# transform the data into pca space\npfx_pca_trans_clean = pfx_pca_clean.transform(pfx_vals_std_clean)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans_clean[:, 0], pfx_pca_trans_clean[:, 1], c=pfx_pca_trans_clean[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# We see that the data is well separated along the first principal component but not as well along the second principal component, as expected.\n\n# Again, let's plot the first two principal components, but color by the labeled pitches from the database.\n\n#%%\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame(\n {\"pca1\": pfx_pca_trans_clean[:, 0], \"pca2\": pfx_pca_trans_clean[:, 1], \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Again, the sliders are well-separated but the other three pitches are not.\n\n# Again, let's look at the loadings of the first two principal components to get a sense of which features might be significant.\n\n#%%\n\n# initialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# first principal component\nloadings1 = pfx_pca_clean.components_[0, :]\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca_clean.components_[1, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings2), align='center', 
alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\nplt.show()\n\n\n# Let's also look at the loadings for the third and fourth principal components.\n\n#%%\n\n# initialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# first principal component\nloadings3 = pfx_pca_clean.components_[2, :]\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings3), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 3\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\n# second principal component\nloadings4 = pfx_pca_clean.components_[3, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings4), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 4\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\nplt.show()\n\n\n# From these plots, we see that all features have similar loadings along the first principal component. This suggests that most of our features display similar variance. We also see that the third principal component corresponds roughly to intial velocity in the horizontal direction, and it does not provide much insight into pitch type separation. A biplot could help determine how much these features are correlated to each other.\n\n#%%\n\n# transform the data into pca space\npfx_pca_trans_clean = pfx_pca_clean.transform(pfx_vals_std_clean)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans_clean[:, 1],\n pfx_pca_trans_clean[:, 2],\n c=pfx_pca_trans_clean[:, 0],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n#%%\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame({\"pca1\": pfx_pca_trans_clean[:, 1],\n \"pca2\": pfx_pca_trans_clean[:, 2],\n \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Now we'll try with an even cleaner data set, where we are only keeping parameters that we suspect are not correlated.\n\n#%%\n\n# remove unimportant features\npfx_vals_vclean = pfx_vals[[\"pfx_x\",\n \"pfx_z\",\n \"spin_rate\",\n \"start_speed\"]]\n\n# standardize data\npfx_vals_std_vclean = stand.fit_transform(pfx_vals_vclean)\n\n#%%\n\n# create pca\npca_vclean = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca_vclean = pca_vclean.fit(pfx_vals_std_vclean)\n\n#%%\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca_vclean.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca_vclean.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 25])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n#%%\n\n# transform the data into pca space\npfx_pca_trans_vclean = pfx_pca_vclean.transform(pfx_vals_std_vclean)\n\n# plot in pca 
space\nplt.scatter(pfx_pca_trans_vclean[:, 0],\n pfx_pca_trans_vclean[:, 1],\n c=pfx_pca_trans_vclean[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n#%%\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame({\"pca1\": pfx_pca_trans_vclean[:, 0],\n \"pca2\": pfx_pca_trans_vclean[:, 1],\n \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n#%%\n\n# first principal component\nloadings1 = pfx_pca_vclean.components_[0, :]\n\n# intialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_vclean.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_vclean.shape[1]), list(pfx_vals_vclean), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca_vclean.components_[1, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_vclean.shape[1]), np.abs(\n loadings2), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_vclean.shape[1]), list(pfx_vals_vclean), rotation=90)\nplt.show()\n\n\n# Overall, we see that PCA does not seem to be able to separate pitch clusters more clearly. We still see three clusters for the most part, with the troublesome spot being the difference between the four-seam fastball (FF) and the sinker (SI).\n\n# ### Hierarchical clustering\n\n# We'll perform hierarchical clustering here to try to group the data. We start with hierarchical clustering because it is useful when there is no prior knowledge of the number of clusters (although that isn't exactly the case here). We will look at bottom-up clustering here. All data points begin as their own cluster, in some ways. Then we fuse data points together that are similar to each other, and continue until all data points are in one cluster.\n#\n# There are some controls over how clusters are fused. There are four common types of linkage (complete, average, single, centroid). There are also dissimilarity measures, such as Euclidean distance and correlation-based distances. For the case of pitch classification, we will start for Euclidean distance, as we're looking for feature values with similar values.\n\n# We will use scipy for this sort of clustering. We will perform hierarchical clustering with complete linkage and euclidean distances using the data with trajectory information only.\n\n#%%\n\n# generate the linkage matrix\nZc = hier.linkage(pfx_vals_std_clean, method='complete', metric='euclidean')\n\n\n# Let's try to plot a dendrogram.\n\n#%%\n\n# plot the dendrogram colored by a certain height\nheightc = 11\nfig = plt.figure(figsize=(25, 10))\ndn = hier.dendrogram(Zc, color_threshold=heightc, leaf_rotation=90)\n\n\n# As we can see, there appears to be three to four natural clusters when using complete linkage (we highlighted three here). However, one of the clusters is relative small and leads to an unbalanced tree. 
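# A quick check (not in the original notebook) of how unbalanced the tree is at this
# cut height; it assumes Zc and heightc from the cells above. fcluster labels start
# at 1, so the leading zero count from bincount is dropped.
sizes = np.bincount(hier.fcluster(Zc, heightc, 'distance'))[1:]
print("cluster sizes at height", heightc, ":", sizes)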
Let's see what these clusters correspond to on a plot of starting speed vs horizontal movement.\n\n#%%\n\n# labels\nlabel = hier.fcluster(Zc, heightc, 'distance')\n\n# values for plot\nhor_mv = pfx_vals_vclean[\"pfx_x\"]\nspeed = pfx_vals_vclean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=label, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We can see that complete linkage doesn't do a very good job here of clustering, as two seemingly obvious clusters are grouped together into one. Let's try ward linkage now to see if we can obtain a more balanced tree. Ward minimizes the variance of the clusters being merged.\n\n#%%\n\n# generate the linkage matrix\nZw = hier.linkage(pfx_vals_std_clean, method='ward', metric='euclidean')\n# plot the dendrogram colored by a certain height\nheightw = 60\nfig = plt.figure(figsize=(25, 10))\ndn = hier.dendrogram(Zw, color_threshold=heightw, leaf_rotation=90)\n\n#%%\n\n# labels\nlabel = hier.fcluster(Zw, 50, 'distance')\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=label, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It appears that we get three reasonable clusters.\n\n# ### Density plots\n\n# We know that the fastball and sinker are difficult to separate. Let's create density plots to see if there are potentially smaller clusters hidden in the three obvious clusters from hierarchical clustering.\n\n# Start by isolating the blue cluster in the previous plot, which contains slow pitches than have a lot of break from right to left. We are likely looking at sliders here.\n\n#%%\n\n# label of largest cluster\nlabel = 1\n\n# find indices\nc1_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c1_ind], speed.iloc[c1_ind],\n color=\"blue\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# As we can see, there appears to be only one cluster in here, which was expected. This is likely Sabathia's slider.\n\n# Next, we'll look at the green cluster, which is slightly faster than the slider but breaks from left to right.\n\n#%%\n\n# label of largest cluster\nlabel = 2\n\n# find indices\nc2_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c2_ind], speed.iloc[c2_ind],\n color=\"green\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# Here, we can see that again, there is only one cluster, which likely corresponds to Sabathia's change-up. However, the spread of values is surprisingly wide.\n\n# Finally, we'll focus on the largest (red) cluster, which presumably has the four-seam fastball and sinker. This is the key problem, as we have not been able to separate the two pitch types through PCA. 
We'll look for whether there are multiple dense clusters that are just hidden.\n\n#%%\n\n# label of largest cluster\nlabel = stats.mode(hier.fcluster(Zw, 50, 'distance'))[0][0]\n\n# find indices\nc3_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c3_ind], speed.iloc[c3_ind],\n color=\"red\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It appears that there is only only cluster here, although it spans a wide range of velocities and horizontal movement. It is possible that the sinker is just a four-seamer with more break. Regardless, it does not appear that these two particular pitches can be separated.\n\n# Based on the spread of the change-up cluster, we'll look at DBSCAN and see if it does a better job.\n\n# ### DBSCAN\n\n# Density-based spatial clustering of applications with noise (DBSCAN) groups together data observations that are close together. This approach has the advantages that, again, the number of clusters beforehand does not need to be known, it is robust to outliers (or noise features that are not truly associated with the response), and can find arbitrarily shaped clusters.\n#\n# The algorithm (https://algorithmicthoughts.wordpress.com/2013/05/29/machine-learning-dbscan/):\n#\n# 1) make an n-dimensional sphere of radius epsilon around the point and count the number of data points within the sphere.\n#\n# 2) if the number of points within the sphere are more than min_points then we mark the center of the sphere to be belonging to a cluster. We also mark the points inside the sphere to be belonging to the same cluster. We then recursively expand the cluster by applying the same criteria to the points inside the sphere, except the center.\n#\n# 3) in case the number of points inside the sphere are less than min_points, we ignore it and proceed to the next point in the dataset.\n#\n# For the algorithm (https://en.wikipedia.org/wiki/DBSCAN), we need to define three parameters: the minimum number of points to create a dense region, the maximum distance between two samples for data observations to be considered as in the same neighborhood, and a distance metric. The first has a rule of thumb of 2\\*(dimensionality of data), and larger values are usually better for data sets with noise and will yield more significant clusters. The second can be estimated from a K-distance plot, looking for an elbow point. The third we'll use euclidean distance here.\n\n# Let's first create a K-nearest-neighbor distance plot to try to estimate the epsilon parameter, which affects the size and number of clusters. Since our data has 14 features, we will set our `min_points` parameter to 28. Have a look at http://www.sthda.com/english/articles/30-advanced-clustering/105-dbscan-density-based-clustering-essentials/\n\n#%%\n\n# set min_points parameter\nmin_points = 2*pfx_vals_std_clean.shape[1]\n\n# initialize nearest-neighbor object\nnbrs = nn.NearestNeighbors(n_neighbors=min_points, metric=\"euclidean\")\n\n# fit nearest-neighbors\nnbrs_fit = nbrs.fit(pfx_vals_std_clean)\nkdist, kind = nbrs_fit.kneighbors(pfx_vals_std_clean)\n\n\n# `kdist` contains distance values from points to neighbors. `kind` contains corresponding to indices of the nearest neighbors. 
Let's plot the distance of the `min_points`-1 (here 27) nearest neighbor for each data point, sorted by distance.\n\n#%%\n\n# get distances and sort\nk27 = np.sort(kdist[:, -1])\n\n# plot sorted distance as a function of data point index\nplt.plot(k27)\nplt.xlabel(\"observation index\")\nplt.ylabel(\"euclidean distance\")\nplt.show()\n\n\n# Looking at this plot, it seems like an `eps` of 1.75 might be a reasonable value, as it is the elbow point. However, we will choose a lower value of 1, as this plot is potentially biased by the first few indices. Now that we have our parameters, we can try out DBSCAN.\n\n#%%\n\n# initialize variables\neps = 1\n\n# initialize dbscan object\ndbscan = clust.DBSCAN(eps=eps, min_samples=min_points, metric=\"euclidean\")\n\n# fit and predict labels with dbscan\ndbscan_pred = dbscan.fit_predict(pfx_vals_std_clean)\n\n\n# Let's take a look at the cluster results in our usual scatter plot.\n\n#%%\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We can see that two clear clusters are separated but have many outliers within them. This is potentially related to the curse of dimensionality, as adding noise features that are not truly associated with the reponse will hurt density measurements. Instead, we will try DBSCAN in PCA space, where we now only have two dimensions. We will also increase the minimum number of points needed for a core point.\n\n#%%\n\n# get pca data and scale\npca_data = pfx_pca_trans_clean[:, :2]\nstand = pp.StandardScaler()\npca_stand = stand.fit_transform(pca_data)\n\n# set min_points parameter\nmin_points = 8*pca_stand.shape[1]\n\n# initialize nearest-neighbor object\nnbrs = nn.NearestNeighbors(n_neighbors=min_points, metric=\"euclidean\")\n\n# fit nearest-neighbors\nnbrs_fit = nbrs.fit(pca_stand)\nkdist, kind = nbrs_fit.kneighbors(pca_stand)\n\n# get distances and sort\nk3 = np.sort(kdist[:, -1])\n\n# plot sorted distance as a function of data point index\nplt.plot(k3)\nplt.xlabel(\"observation index\")\nplt.ylabel(\"euclidean distance\")\nplt.show()\n\n\n# Again, there is an elbow point, but a smaller epsilon value than 0.2 might be useful here, as two clusters of the three clusters in PCA space are not well separated (but still distinguishable).\n\n#%%\n\n# initialize variables\neps = 0.16\n\n# initialize dbscan object\ndbscan = clust.DBSCAN(eps=eps, min_samples=min_points, metric=\"euclidean\")\n\n# fit and predict labels with dbscan\ndbscan_pred = dbscan.fit_predict(pca_stand)\n\n\n# We'll project our cluster labels into PCA space to see how DBSCAN did visually.\n\n#%%\n\n# values for plot\n#hor_mv = pfx_vals_clean[\"pfx_x\"]\n#speed = pfx_vals_clean[\"start_speed\"]\nhor_mv = pca_data[:, 0]\nspeed = pca_data[:, 1]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# We can see that the most dense parts of the clusters are well separated, with a lot of outliers also identified. 
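# To put a number on "a lot of outliers", count the DBSCAN labels (a small sketch,
# not in the original notebook; it assumes dbscan_pred from the cell above).
# In scikit-learn's DBSCAN, the label -1 marks noise points.
labels, counts = np.unique(dbscan_pred, return_counts=True)
print(dict(zip(labels, counts)))
print("fraction flagged as noise:", np.mean(dbscan_pred == -1))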
Let's see what the classifications look like in actual feature space.\n\n#%%\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It looks fairly similar to the results using all trajectory features. However, most of the outliers are along the edges of the clusters, unlike when using all trajectory features.\n\n# Overall, it seems that Sabathia has three primary types of pitches:\n#\n# 1) \"Hard\" pitches; ones that comes in fast with various amounts of horizontal break, which determine the difference between four-seam fastballs and sinkers (yellow cluster above)\n#\n# 2) \"Off-speed\" pitches; ones that have similar horizontal break as \"hard\" pitches, but slower (blue cluster above)\n#\n# 3) \"Breaking\" pitches; ones that have very different horizontal break compared to the other two pitches (red cluster above)\n\n# ### K-means clustering\n\n# We'll now try K-means clustering. Every data point will be a part of a cluster, but we need to specify the number of clusters beforehand. Here we set it to 3, and also run Kmeans on the trajectory information.\n\n#%%\n\n# initialize parameters\nnclust = 3\nniter = 500\n\n# run kmeans and predict label\nklabels = clust.KMeans(n_clusters=nclust, random_state=0,\n n_init=niter).fit_predict(pfx_vals_std_clean)\n\n\n# Let's plot the results in our usual domain.\n\n#%%\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=klabels, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# Next, we'll try clustering in our PCA space.\n\n#%%\n\n# run kmeans and predict label\nklabels = clust.KMeans(n_clusters=nclust, random_state=0,\n n_init=niter).fit_predict(pca_stand)\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=klabels, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We see that the cluster results in PCA space lead to less-convincing clusters. This is expected, given that the clusters are not as well separated in PCA space as in start speed and horizontal movement space.\n\n# ### Feature correlation\n\n# Highly-correlated features can be an issue, particularly when using Euclidean distances as a metric. This is why we turned to clustering in PCA space, as PCA in some ways decorrelates the features through an orthogonal linear transformation. Here, we'll look at the correlation matrix of features to find features that are highly correlated.\n\n#%%\n\n# create correlation matrix of features\ncof = np.corrcoef(pfx_vals_clean.T)", "target_code": "ff = np.argwhere(np.logical_and(abs(cof) > 0.95, cof != 1))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Practice plotting\n\n# Here, we'll do some exploratory data analysis (EDA) of the pitchfx data. EDA is a quick and dirty way to determine what is in your dataset. There are many ways to do this, and plotting the data is a key way of looking at the data. 
Note that this is mainly practice in plotting a bunch of different types of figures.\n#\n# For reference, a description of the *pitchfx* variables can be found here: https://fastballs.wordpress.com/category/pitchfx-glossary/\n#\n# Begin by importing the necessary libraries and specifying a name for the database you want to create.\n\n\n# imports\nfrom IPython.display import display\nfrom mpl_toolkits.mplot3d import Axes3D\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport scipy.cluster.hierarchy as hier\nimport scipy.stats as stats\nimport sklearn.cluster as clust\nimport sklearn.decomposition as dd\nimport sklearn.neighbors as nn\nimport sklearn.preprocessing as pp\nimport sqlite3\nimport subprocess\n\n# specify database name\ndbname = \"../../dat/pitchfx2008.db\"\n\n\n# We'll work with a subset of the dataset when performing EDA. Specifically, we will look at the pitches from a single pitcher. Here, it will be CC Sabathia.\n\n\n# connect to the sqlite3 database\ndb = sqlite3.connect(dbname)\nhdb = db.cursor()\n\n\n# get specific pitcher info\nquery = \"\"\"SELECT DISTINCT pitchfx.* \n FROM pitchfx\n JOIN events ON (pitchfx.game_id=events.game_id\n AND pitchfx.cur_event=events.event_id)\n WHERE events.pitcher_id=(SELECT player_id\n FROM players\n WHERE players.player_first='CC'\n AND players.player_last='Sabathia')\n ORDER BY game_id, pitch_num\"\"\"\ndf = pd.read_sql_query(query, db)\ndf.head()\n\n\n# ### Printed summaries of data\n\n# One of the first things you can do is to get a typed summary of the data set. We can do this in pandas, which has similar functions as in R. The *describe* function in pandas is similar to the *summary* function in R. Note that function only returns quantitative variables.\n\n\n# print out summary of pandas dataframe\n# df.describe()\n\n# transpose it to see all the stats\ndf.describe().transpose()\n\n\n# Note that you can also look at the stats for a single feature. Here, we'll look at *spin rate*.\n\n\n# show stats for spin rate\ndf['spin_rate'].describe()\n\n\n# Note that you can also store data in a different variable name and perform similar summary commands. Here, we will look at the percentiles (specifically the 0.1 and 0.5 percentile). The 50% percentile should match the median.\n\n\n# store spin rate in a different variable\nspin_rate = df[\"spin_rate\"]\nspin_rate.quantile([0.1, 0.5])\n\n\n# ### Boxplots\n\n# Boxplots are common when visualizing quartile information. We'll stay with *spin rate* for our boxplot. Before plotting the data, we'll need to remove the nans.\n#\n# Note that when extracting a single column from a dataframe, the column is now a *series* object. The series should be converted into a numpy array if plotting from matplotlib and not pandas (here we are plotting directly from a pandas object).\n#\n# The red line is the median, the edges of the box are the 25% and 75% quartiles, and the the lines below and above the box correspond to some factor (*whis* in the arguments) multiplied by the interquartile range (Q3-Q1). 
It is meant to help identify outliers.\n\n\n# remove nans\nspin_rate = spin_rate.dropna()\n\n# create boxplot\n#plt.boxplot(spin_rate.as_matrix(), whis=1.5)\nspin_rate.plot.box(whis=1.5)\nplt.show()\n\n\n# Now, let's create boxplots of velocity sorted by pitch types.\n\n\n# create plot\ndf.boxplot(column=\"start_speed\", by=\"pitch_type\")\nplt.show()\n\n\n# As we can see, Sabathia's fastball has the highest starting velocity, followed by his sinker, change-up, slider, and cutter.\n\n# Next, we'll look at how horizontal movement in inches (pfx_x) varies with pitch type.\n\n\n# create plot\ndf.boxplot(column=\"pfx_x\", by=\"pitch_type\")\nplt.show()\n\n\n# As we can see, horizontal movement varies dramatically between sliders and changes-ups. Perhaps surprisingly, data suggests that Sabathia's slider has less horizontal movement than his change-up. It is assumed that, from the hitter's and catcher's perspective, negative values correspond to movement right to left and positive values correspond to movement left to right (based on Sabathia's handedness and the natural break of the slider).\n\n# Now let's look at vertical movement.\n\n\n# create plot\ndf.boxplot(column=\"pfx_z\", by=\"pitch_type\")\nplt.show()\n\n\n# We see that many pitches have \"upward\" movement, which doesn't make sense. However, given the way movement is defined, this can make sense. For example, most pitches, with curveballs being the main exception, have some backspin, which causes them not to drop as much as a spinless pitch (https://www.beyondtheboxscore.com/2009/4/17/841366/understanding-pitch-f-x-graphs). Note that again, sliders and change-ups are well separated from each other, but many pitches have positive vertical movement.\n\n# We'll look more closely at how often each of these pitches is thrown in a later section.\n\n# ### Histograms\n\n# Next, we'll create a histogram of the same *spin_rate* data. We'll use *matplotlib* directly here. We'll show the effects of plotting histograms with different bin sizes side by side.\n\n\n# convert series data to numpy array before plotting\n# share y-axis to show difference in values\nf, (ax1, ax2) = plt.subplots(1, 2, sharey=True)\nax1.hist(spin_rate.as_matrix())\nax1.set_xlabel(\"Spin rate\")\nax1.set_xticks(np.arange(0, 4000, 1000))\nax2.hist(spin_rate.as_matrix(), bins=20)\nax2.set_xlabel(\"Spin rate\")\nax2.set_xticks(np.arange(0, 4000, 1000))\nplt.show()\n\n\n# We can also use seaborn to creat histograms with rug plots below. The rug plot is a fine-grain overview of the data, as opposed to a histogram, which bins the data. However, with so many data points the rug plot is not too useful. Note that the y-axis is normalized by default, as this is a distribution.\n\n\nfig, ax = plt.subplots()\nsns.distplot(spin_rate.as_matrix(), rug=True, hist=True, rug_kws={\"color\": \"g\"},\n kde_kws={\"color\": \"k\", \"lw\": 3})\nax.set_xlabel(\"Spin rate\")\nplt.show()\n\n\n# ### Bar charts\n\n# We will now create a bar chart of all the pitch types thrown by CC Sabathia. 
First, let's get a summary of the pitch type information.\n\n\n# describe pitch type information\ndf[\"pitch_type\"].describe()\n\n\n# There are NaNs in the data, so let's clean them out and put them in a new variable name.\n\n\npitch_type = df[\"pitch_type\"].dropna()\n\n\n# Next, let's create a bar plot of all the pitch types.\n\n\nfig, ax = plt.subplots()\npitch_type.value_counts().plot(ax=ax, kind=\"bar\")\nax.set_xlabel(\"Pitch type\")\nax.set_ylabel(\"Frequency\")\nplt.show()\n\n\n# We can see that Sabathia complements his four-seam fastball with a slider, change-up, and sinker. There are a few pitches classified as cutters, but they are unlikely to actually be cutters given how infrequent they are throughout the season.\n\n# ### Scatter plots\n\n# Here, we'll look at the raw data through scatter plots of various measurements. First, we'll pull up a summary of the pitchfx parameters that are available.\n\n\n# get variable names from pitch fx with data types\nprint(*[df.dtypes], sep=\"\\n\")\n\n\n# Most of these variables are going to be useful for classifying pitches. However, not all of them will be useful. Specifically, any variable not directly related to the trajectory of the ball. Therefore, we will chop off the first eleven variables, which have mostly to do with the times and situations at which the pitch was thrown. For a description of each of the features, have a look at: https://fastballs.wordpress.com/category/pitchfx-glossary.\n\n\n# create truncated pitchfx data frame\npdf = df.iloc[:, 11:].dropna()\n\n\n# Now we have a smaller data frame with fewer features. Let make a scatter plots of all these variables to see if there are any clear trends. Note that we will drop all rows containing NaNs to make the analysis easier. Typically where there are NaNs, it means that there was no measurment made of the pitch trajectory information. Therefore, we drop the entire row. We will also drop the last column, which is the pitch classification from MLB.\n#\n# Here, we will plot all features but only 100 pitches.\n\n\n#pd.scatter_matrix(pdf.iloc[:100, :-1].dropna(axis=0, how='all'), alpha=0.2, figsize=(10, 10))\npd.scatter_matrix(pdf.iloc[:1000, :-1].dropna(axis=0,\n how='all'), alpha=0.2, figsize=(10, 10))\nplt.show()\n\n\n# There is a lot of information in this scatter matrix, and it's difficult to pick out certain trends. However, we will try to explain some of what we see.\n#\n# 1) yo and break_y seem to be variables with a small spread of values. The former is the distance in feet from home plate where the PITCHf/x system is set to measure the initial parameters, and it is typically fixed after the year 2007. The latter is the distance in feet from home plate to the point in the pitch trajectory where the pitch achieved its greatest deviation from the straight line path between the release point and the front of home plate. This trait is potentially a fixed value for each game/stadium and does not appear to be very insightful.\n#\n# 2) There are very strong linear trends amongst certain features. Among them are x and px and y and pz. All these features are related to the location of the ball crossing the plate, with x and y tied to the old location system and px and pz related to the pitchfx system. Additionally, there seem to be strong trends between ax and pfx_x and az and pfx_z. This suggets that movement (in inches) is tied to acceleration (in ft/s), which is not surprising. 
Also, there is a strong correlation between start_speed and vy0, which is not surprising given that they are measuring speeds toward the plate.\n#\n# 3) There appears to be certain features that produce distinct clusters. For instance, start_speed and spin_rate, start_speed and pfx_x (horizontal movement), and start_speed and break_angle. These might be useful features when attempting to classify pitch types.\n\n# ### Cross plots\n\n# Let's focus on some of the more interesting plots. From point number 3, it seems that useful parameters to look at are end_speed, spin_rate, and pfx_x (horizontal movement). Let's put all of these into one crossplot, with end speed (mph) and spin rate (rpm) as the axes and horizontal movement as the color.\n\n\n# crossplot of start speed and spin rate, colored by horizontal movement\nplt.scatter(pdf[\"start_speed\"], pdf['spin_rate'],\n c=pdf['pfx_x'], cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"spin rate (rpm)\")\nplt.colorbar(label=\"horizontal movement (in)\")\nplt.show()\n\n\n# There are three clear clusters!\n#\n# The slowest pitch has a low spin rate and significant movement from right to left (from a left-handed pitcher), which suggests that this pitch is a slider. The second slowest cluster has a higher spin rate and velocity than the slider, and has movement from left to right, which suggests this pitch is his change-up. The third cluster appears to contain two types of pitches, based on the difference in horizontal movement. From looking at the previous bar charts of horizontal movement grouped by pitch type, the sinker has more horizontal movement than the four-seam fastball. Therefore, it looks like we could separate pitches in the third cluster with another axis.\n\n# Now let's compare our results to the given pitch classifications. We'll plot velocity against spin rate, and color by the pitch type.\n\n\n# crossplot of spin rate and start speed, colored by pitch type\nsns.pairplot(x_vars=\"start_speed\", y_vars=\"spin_rate\",\n data=pdf, hue=\"pitch_type\", size=5)\nplt.show()\n\n\n# We can see that two of our classifications (slider and changeup) are separated and clustered as we had hypothesized. We can also see that the sinker and four-seam fastball are overlain on the third cluster. However, the spread in spin rate of the sinker appears greater than hypothesized in the previous crossplot.\n\n# Let's get another view of the clusters. Here, we'll switch two of the axes: spin rate and horizontal movement. We will now color by spin rate.\n\n\n# crossplot of start speed and spin rate, colored by horizontal movement\nplt.scatter(pdf[\"start_speed\"], pdf['pfx_x'],\n c=pdf['spin_rate'], cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"horizontal movement (in)\")\nplt.colorbar(label=\"spin rate (rpm)\")\nplt.show()\n\n\n# It appears that the higher the spin rate, the greater the horizontal movement, which makes sense. However, it doesn't appear to separate sinkers and fastballs that clearly. We know that sinkers typically have more movement, so it's likely that the higher spin rate pitches are sinkers.\n\n# Let's plot the same scatter plot but color by the classifications from the dataset. 
We'll see how our hypothesis that sinkers have more spin compares to the given labels.\n\n\n# crossplot of start speed and horizontal movement, colored by spin rate\nsns.pairplot(x_vars=\"start_speed\", y_vars=\"pfx_x\",\n data=pdf, hue=\"pitch_type\", size=5)\nplt.xlabel(\"velocity (mph)\")\nplt.ylabel(\"horizontal movement (in)\")\nplt.show()\n\n\n# As we can see, the sinker has more horizontal movement compared to the four-seam fastball. Therefore, it seems like it's possible to separate the pitches given the features we have. Next, we'll perform some dimension reduction to see if we can separate these events better and to better-understand which features are important.\n\n# ### Dimension reduction\n\n# Dimension reduction is a way to transform the full data set into a lower-dimension representation. We'll look at principal component analysis (PCA) here, as it is a pretty standard method.\n\n# PCA performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. We can potentially separate these pitches in a different space, as well as infer which features are important for data separation.\n\n# As first step, we need to standardize the features. This is an important step here, as the features are very different units and have very different scales. We'll remove the means and divide by the variance of each feature. Remember to remove the categorical features here, as they won't be useful in this case. Additionally, we will remove numerical features that are just indices (e.g., pitch number and event number).\n\n# Note that an alternative to use *OneHotEncoder* to encode the categorical variables instead of dropping them (see: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder).\n\n# A note about sklearn. The pipeline is usually that you initialize your sklearn objects (e.g., standardizing variables, svm, pca, etc.). Next, you can run something like *fit*, *transform*, or *fit_transform*. *fit* means that the output is some sort of value(s). *transform* means that the output has the same dimensions as the input. Sometimes you run *fit* then *transform*, in which case you can use *fit_transform* if you don't care about saving the intermediate values.\n#\n# In the case of standardizing variables, the *fit* portion is when you calculate the mean and standard deviations of the features. The *transform* portion is the actual standardization of the features.\n\n\n# remove categorical variables\npfx_vals = df.drop(df.dtypes.index[df.dtypes.values == \"object\"], 1).dropna()\n\n# remove indexing variables\npfx_vals = pfx_vals.drop([\"game_id\",\n \"pitch_num\",\n \"at_bat\",\n \"time\",\n \"cur_event\"], axis=1)\n\n# this is how to standardize the variables in two steps\n# standardize the features\n#stand = pp.StandardScaler()\n#pfx_std = stand.fit(pfx_vals)\n#\n# now standardize the features\n#pfx_vals_std = pfx_std.transform(pfx_vals)\n\n# this is how to standardize the variables in one step\nstand = pp.StandardScaler()\npfx_vals_std = stand.fit_transform(pfx_vals)\n\n\n# Now let's run some PCA now that the features are standardized. We will only use *fit* here, as we're interested in looking at the principal components and the explained variance ratios. 
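As a quick sanity check on the standardization above (an added snippet, reusing the objects already defined), the one-step fit_transform call should match the two-step fit-then-transform route exactly:\n\n\n# added check: two-step standardization equals the one-step version\ntwo_step = pp.StandardScaler().fit(pfx_vals).transform(pfx_vals)\nprint(np.allclose(two_step, pfx_vals_std))\n\n\n# 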
Note that *whiten* is left off here (whiten=False). If it were turned on, the components_ vectors would be multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. Whitening removes some information from the transformed signal (the relative variance scales of the components) but can sometimes improve the predictive accuracy of downstream estimators by making their data respect some hard-wired assumptions.\n\n\n# create pca\npca = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca = pca.fit(pfx_vals_std)\n\n\n# Let's plot the percent of variance explained by each principal component and the cumulative sum of the variance explained on the same plot. This type of plot can help determine how many principal components to keep for data compression or how much variance can be explained when visualizing the data.\n\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 34])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n\n# There are a few observations we can make. First, note that we've added a black horizontal dashed line at 90% of the data variance explained. We can see that we can explain 90% of the variance in the data with just 8 principal components (as opposed to the full 33 components). Therefore, we can greatly compress the size of our data set if we are willing to leave 10% of the variance unexplained. Second, there is a clear \"elbow\" after the first principal component in the explained variance plot. In other words, while the first principal component only explains roughly 40% of the variance in the data, further principal components do not explain a substantial amount of variance in the data. Therefore, nearly half the variance in the data is explained by the first two principal components, which is useful to know for visualizing our data in 2D.\n\n# So, let's plot our data's PCA scores over the first two principal components, colored by the third principal component.\n\n\n# transform the data into pca space\npfx_pca_trans = pfx_pca.transform(pfx_vals_std)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans[:, 0], pfx_pca_trans[:, 1], c=pfx_pca_trans[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# As we can see, there is a clear separation along the first principal component into two clusters. This trend of two clusters explains why the first principal component already explains 40% of the variance in the data. The second principal component shows relatively less variation along its axis, and the third (color) less than that. It appears that there is not much clear information about different types of pitches here (though a third cluster might be interpretable in the figure).\n\n# Note that we can get the PCA transforms ourselves without having to use the *transform* method. 
We can get the same result by multiplying the original scaled features by the components matrix. For instance, we can get the first principal component projections using the code below.\n\n\n# first principal component\nloadings1 = pfx_pca.components_[0, :]\nprint(np.dot(pfx_vals_std, loadings1))\nprint(pfx_pca_trans[:, 0])\n\n\n# Let's again plot the first two principal components, but color by the labeled pitches from the database.\n\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame(\n {\"pca1\": pfx_pca_trans[:, 0], \"pca2\": pfx_pca_trans[:, 1], \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Here, we see that the sliders are the easiest pitch to separate, as they are clearly in their own cluster. However, the remaining three pitches (changeup, sinker, fastball) are grouped into one cluster. These three pitches are spread similarly along the second principal component axis. Therefore, PCA doesn't seem to be helping to separate pitches much here.\n\n# Let's take a closer look at which features the principal components are picking out as the \"most important\" in terms of variance. To find the most important variables in terms of their contributions to the principal component, we will look at loadings. Loadings are the projections of the principal components onto your variables. A particularly high (or particularly low) loading for a specific variable means that the principal component is intimately related to that variable.\n#\n# We'll do this by looking for the largest values in the first principal component (first row of the component matrix here), and then the second principal component. Note that the components matrix has the shape (n_components, n_features). A biplot would be useful here; instead, we plot bar charts of the loadings and inspect the important features manually.\n\n\n# first principal component\nloadings1 = pfx_pca.components_[0, :]\n\n# initialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals.shape[1]), list(pfx_vals), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca.components_[1, :]\n\n# plot barchart of loadings2\nax[1].bar(range(pfx_vals.shape[1]), np.abs(\n loadings2), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals.shape[1]), list(pfx_vals), rotation=90)\nplt.show()\n\n\n# Let's focus first on the first principal component (left). We see that many of the features that relate to the ball trajectory (e.g., velocity, break length/angle, acceleration, movement) appear to be the most important features for the first principal component. From our previous analysis, we know that we can obtain relatively clear clusters plotting only these parameters.\n#\n# Looking at the second principal component (right), it seems that the most important features are mostly related to the location of the pitches. While this is useful information, it is unlikely to be of much use in classifying pitches.\n#\n# We will do some further \"cleaning\" of the data to tailor it to the goal of pitch clustering. 
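Before that, the small added helper below (an illustration, not part of the original notebook) ranks features by the magnitude of their loadings on the first principal component, which can make the bar charts above easier to read:\n\n\n# added sketch: list the features with the largest absolute loadings on PCA 1\norder = np.argsort(np.abs(loadings1))[::-1]\nfor idx in order[:5]:\n    print(list(pfx_vals)[idx], round(loadings1[idx], 3))\n\n\n# Back to the cleaning step: 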
Specifically, we will remove all features that do not contribute to ball trajectory (i.e., pitch locations) and see what we get.\n\n\n# remove unimportant features\npfx_vals_clean = pfx_vals.drop([\"pre_balls\",\n \"post_balls\",\n \"pre_strike\",\n \"post_strike\",\n \"sz_top\",\n \"sz_bot\",\n \"x\",\n \"y\",\n \"px\",\n \"pz\",\n \"x0\",\n \"y0\",\n \"z0\",\n \"break_y\"], axis=1)\n\n# standardize data\npfx_vals_std_clean = stand.fit_transform(pfx_vals_clean)\n\n\n# Now let's look at PCA again with the new cleaned data.\n\n\n# create pca\npca_clean = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca_clean = pca_clean.fit(pfx_vals_std_clean)\n\n\n# As before, let's plot the percent of variance explained by each principal component and the cumulative sum of the variance explained on the same plot.\n\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca_clean.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca_clean.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 25])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n\n# We can see that the first principal component already explains 70% of the variance in the dataset.\n\n# Let's plot the PCA scores again for the first three principal components.\n\n\n# transform the data into pca space\npfx_pca_trans_clean = pfx_pca_clean.transform(pfx_vals_std_clean)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans_clean[:, 0], pfx_pca_trans_clean[:, 1], c=pfx_pca_trans_clean[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# We see that the data is well separated along the first principal component but not as well along the second principal component, as expected.\n\n# Again, let's plot the first two principal components, but color by the labeled pitches from the database.\n\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame(\n {\"pca1\": pfx_pca_trans_clean[:, 0], \"pca2\": pfx_pca_trans_clean[:, 1], \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Again, the sliders are well-separated but the other three pitches are not.\n\n# Again, let's look at the loadings of the first two principal components to get a sense of which features might be significant.\n\n\n# initialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# first principal component\nloadings1 = pfx_pca_clean.components_[0, :]\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca_clean.components_[1, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings2), align='center', 
alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\nplt.show()\n\n\n# Let's also look at the loadings for the third and fourth principal components.\n\n\n# initialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# first principal component\nloadings3 = pfx_pca_clean.components_[2, :]\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings3), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 3\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\n# second principal component\nloadings4 = pfx_pca_clean.components_[3, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_clean.shape[1]), np.abs(\n loadings4), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 4\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_clean.shape[1]), list(pfx_vals_clean), rotation=90)\n\nplt.show()\n\n\n# From these plots, we see that all features have similar loadings along the first principal component. This suggests that most of our features display similar variance. We also see that the third principal component corresponds roughly to intial velocity in the horizontal direction, and it does not provide much insight into pitch type separation. A biplot could help determine how much these features are correlated to each other.\n\n\n# transform the data into pca space\npfx_pca_trans_clean = pfx_pca_clean.transform(pfx_vals_std_clean)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans_clean[:, 1],\n pfx_pca_trans_clean[:, 2],\n c=pfx_pca_trans_clean[:, 0],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame({\"pca1\": pfx_pca_trans_clean[:, 1],\n \"pca2\": pfx_pca_trans_clean[:, 2],\n \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# Now we'll try with an even cleaner data set, where we are only keeping parameters that we suspect are not correlated.\n\n\n# remove unimportant features\npfx_vals_vclean = pfx_vals[[\"pfx_x\",\n \"pfx_z\",\n \"spin_rate\",\n \"start_speed\"]]\n\n# standardize data\npfx_vals_std_vclean = stand.fit_transform(pfx_vals_vclean)\n\n\n# create pca\npca_vclean = dd.PCA(whiten=False)\n\n# run pca in two steps\npfx_pca_vclean = pca_vclean.fit(pfx_vals_std_vclean)\n\n\n# plot variance ratio against principal component\nfig, ax1 = plt.subplots()\nax1.plot(pfx_pca_vclean.explained_variance_ratio_, \"o-\", color=\"b\")\nax1.set_xlabel(\"principal component\")\nax1.set_ylabel(\"explained variance ratio\")\nax1.set_ylim([-0.01, 1.1])\nax1.yaxis.label.set_color('b')\n\n# plot cumulative sum of explained variance ratio against principal component\nax2 = ax1.twinx()\nax2.step(np.cumsum(pfx_pca_vclean.explained_variance_ratio_), \"o-\", color=\"r\")\nax2.set_ylabel(\"cumulative sum of explained variance ratio\")\nax2.set_xlim([-1, 25])\nax2.set_ylim([-0.01, 1.1])\nax2.yaxis.label.set_color('r')\nax2.axhline(y=0.9, color=\"k\", ls=\"dashed\")\nplt.show()\n\n\n# transform the data into pca space\npfx_pca_trans_vclean = pfx_pca_vclean.transform(pfx_vals_std_vclean)\n\n# plot in pca space\nplt.scatter(pfx_pca_trans_vclean[:, 0],\n 
pfx_pca_trans_vclean[:, 1],\n c=pfx_pca_trans_vclean[:, 2],\n cmap=\"jet\", alpha=0.2)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.colorbar(label=\"PCA 3\")\nplt.clim(-3, 3)\nplt.show()\n\n\n# create dataframe for plotting\ndf_pca = pd.DataFrame({\"pca1\": pfx_pca_trans_vclean[:, 0],\n \"pca2\": pfx_pca_trans_vclean[:, 1],\n \"pitch_type\": np.array(pdf.pitch_type)}, )\n\n# plot in pca space, colored by pitch type from database\nsns.pairplot(x_vars=\"pca1\", y_vars=\"pca2\",\n data=df_pca, hue=\"pitch_type\", size=5)\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# first principal component\nloadings1 = pfx_pca_vclean.components_[0, :]\n\n# intialize figure\nfig, ax = plt.subplots(1, 2, figsize=(12, 4))\n\n# plot barchart of loadings1\nax[0].bar(range(pfx_vals_vclean.shape[1]), np.abs(\n loadings1), align='center', alpha=0.5)\nax[0].set_ylabel(\"loadings of PCA 1\")\nplt.sca(ax[0])\nplt.xticks(range(pfx_vals_vclean.shape[1]), list(pfx_vals_vclean), rotation=90)\n\n# second principal component\nloadings2 = pfx_pca_vclean.components_[1, :]\n\n# plot barchart of loadings1\nax[1].bar(range(pfx_vals_vclean.shape[1]), np.abs(\n loadings2), align='center', alpha=0.5)\nax[1].set_ylabel(\"loadings of PCA 2\")\nplt.sca(ax[1])\nplt.xticks(range(pfx_vals_vclean.shape[1]), list(pfx_vals_vclean), rotation=90)\nplt.show()\n\n\n# Overall, we see that PCA does not seem to be able to separate pitch clusters more clearly. We still see three clusters for the most part, with the troublesome spot being the difference between the four-seam fastball (FF) and the sinker (SI).\n\n# ### Hierarchical clustering\n\n# We'll perform hierarchical clustering here to try to group the data. We start with hierarchical clustering because it is useful when there is no prior knowledge of the number of clusters (although that isn't exactly the case here). We will look at bottom-up clustering here. All data points begin as their own cluster, in some ways. Then we fuse data points together that are similar to each other, and continue until all data points are in one cluster.\n#\n# There are some controls over how clusters are fused. There are four common types of linkage (complete, average, single, centroid). There are also dissimilarity measures, such as Euclidean distance and correlation-based distances. For the case of pitch classification, we will start for Euclidean distance, as we're looking for feature values with similar values.\n\n# We will use scipy for this sort of clustering. We will perform hierarchical clustering with complete linkage and euclidean distances using the data with trajectory information only.\n\n\n# generate the linkage matrix\nZc = hier.linkage(pfx_vals_std_clean, method='complete', metric='euclidean')\n\n\n# Let's try to plot a dendrogram.\n\n\n# plot the dendrogram colored by a certain height\nheightc = 11\nfig = plt.figure(figsize=(25, 10))\ndn = hier.dendrogram(Zc, color_threshold=heightc, leaf_rotation=90)\n\n\n# As we can see, there appears to be three to four natural clusters when using complete linkage (we highlighted three here). However, one of the clusters is relative small and leads to an unbalanced tree. 
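As an added side note (a sketch using scipy's fcluster with the 'maxclust' criterion), we could also skip the height threshold and ask directly for a fixed number of flat clusters:\n\n\n# added sketch: cut the complete-linkage tree into three flat clusters directly\nlabels_k3 = hier.fcluster(Zc, 3, criterion='maxclust')\nprint(np.bincount(labels_k3)[1:])  # cluster sizes\n\n\n# Returning to the height-based cut: 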
Let's see what these clusters correspond to on a plot of starting speed vs horizontal movement.\n\n\n# labels\nlabel = hier.fcluster(Zc, heightc, 'distance')\n\n# values for plot\nhor_mv = pfx_vals_vclean[\"pfx_x\"]\nspeed = pfx_vals_vclean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=label, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We can see that complete linkage doesn't do a very good job here of clustering, as two seemingly obvious clusters are grouped together into one. Let's try ward linkage now to see if we can obtain a more balanced tree. Ward minimizes the variance of the clusters being merged.\n\n\n# generate the linkage matrix\nZw = hier.linkage(pfx_vals_std_clean, method='ward', metric='euclidean')\n# plot the dendrogram colored by a certain height\nheightw = 60\nfig = plt.figure(figsize=(25, 10))\ndn = hier.dendrogram(Zw, color_threshold=heightw, leaf_rotation=90)\n\n\n# labels\nlabel = hier.fcluster(Zw, 50, 'distance')\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=label, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It appears that we get three reasonable clusters.\n\n# ### Density plots\n\n# We know that the fastball and sinker are difficult to separate. Let's create density plots to see if there are potentially smaller clusters hidden in the three obvious clusters from hierarchical clustering.\n\n# Start by isolating the blue cluster in the previous plot, which contains slow pitches than have a lot of break from right to left. We are likely looking at sliders here.\n\n\n# label of largest cluster\nlabel = 1\n\n# find indices\nc1_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c1_ind], speed.iloc[c1_ind],\n color=\"blue\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# As we can see, there appears to be only one cluster in here, which was expected. This is likely Sabathia's slider.\n\n# Next, we'll look at the green cluster, which is slightly faster than the slider but breaks from left to right.\n\n\n# label of largest cluster\nlabel = 2\n\n# find indices\nc2_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c2_ind], speed.iloc[c2_ind],\n color=\"green\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# Here, we can see that again, there is only one cluster, which likely corresponds to Sabathia's change-up. However, the spread of values is surprisingly wide.\n\n# Finally, we'll focus on the largest (red) cluster, which presumably has the four-seam fastball and sinker. This is the key problem, as we have not been able to separate the two pitch types through PCA. 
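As an added quick check (a sketch reusing Zw and the speed series defined above), a one-dimensional density of start speed within the largest ward cluster hints at whether two overlapping pitch types are hiding inside it:\n\n\n# added sketch: kernel density of start speed inside the largest ward cluster\nlabels_w = hier.fcluster(Zw, 50, 'distance')\nlargest = np.bincount(labels_w).argmax()\nsns.kdeplot(speed[labels_w == largest])\nplt.xlabel('start speed (mph)')\nplt.show()\n\n\n# 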
We'll look for whether there are multiple dense clusters that are just hidden.\n\n\n# label of largest cluster\nlabel = stats.mode(hier.fcluster(Zw, 50, 'distance'))[0][0]\n\n# find indices\nc3_ind = np.where(hier.fcluster(Zw, 50, 'distance') == label)[0]\n\n# project label onto data points\nsns.jointplot(hor_mv.iloc[c3_ind], speed.iloc[c3_ind],\n color=\"red\", kind=\"hex\", xlim=[-20, 20], ylim=[70, 100])\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It appears that there is only one cluster here, although it spans a wide range of velocities and horizontal movement. It is possible that the sinker is just a four-seamer with more break. Regardless, it does not appear that these two particular pitches can be separated.\n\n# Based on the spread of the change-up cluster, we'll look at DBSCAN and see if it does a better job.\n\n# ### DBSCAN\n\n# Density-based spatial clustering of applications with noise (DBSCAN) groups data observations that are close to each other. This approach has the advantages that, again, the number of clusters does not need to be known beforehand, that it is robust to outliers (noise observations), and that it can find arbitrarily shaped clusters.\n#\n# The algorithm (https://algorithmicthoughts.wordpress.com/2013/05/29/machine-learning-dbscan/):\n#\n# 1) Make an n-dimensional sphere of radius epsilon around a point and count the number of data points within the sphere.\n#\n# 2) If the number of points within the sphere is more than min_points, we mark the center of the sphere as belonging to a cluster. We also mark the points inside the sphere as belonging to the same cluster. We then recursively expand the cluster by applying the same criteria to the points inside the sphere, except the center.\n#\n# 3) If the number of points inside the sphere is less than min_points, we ignore it and proceed to the next point in the dataset.\n#\n# To run the algorithm (https://en.wikipedia.org/wiki/DBSCAN), we need to define three parameters: the minimum number of points to create a dense region, the maximum distance between two samples for data observations to be considered as in the same neighborhood, and a distance metric. The first has a rule of thumb of 2\\*(dimensionality of data), and larger values are usually better for data sets with noise and will yield more significant clusters. The second can be estimated from a K-distance plot, looking for an elbow point. For the third, we will use Euclidean distance here.\n\n# Let's first create a K-nearest-neighbor distance plot to try to estimate the epsilon parameter, which affects the size and number of clusters. Since our data has 14 features, we will set our `min_points` parameter to 28. Have a look at http://www.sthda.com/english/articles/30-advanced-clustering/105-dbscan-density-based-clustering-essentials/\n\n\n# set min_points parameter\nmin_points = 2*pfx_vals_std_clean.shape[1]\n\n# initialize nearest-neighbor object\nnbrs = nn.NearestNeighbors(n_neighbors=min_points, metric=\"euclidean\")\n\n# fit nearest-neighbors\nnbrs_fit = nbrs.fit(pfx_vals_std_clean)\nkdist, kind = nbrs_fit.kneighbors(pfx_vals_std_clean)\n\n\n# `kdist` contains distance values from points to neighbors. `kind` contains the corresponding indices of the nearest neighbors. 
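One rough added heuristic (an assumption on my part, not something from the original notebook) is to approximate the elbow of the sorted k-distance curve with the largest discrete second difference:\n\n\n# added heuristic: crude elbow estimate from the sorted k-distance curve\nkcurve = np.sort(kdist[:, -1])\nelbow_idx = np.argmax(np.diff(kcurve, 2))\nprint('suggested eps is roughly', round(float(kcurve[elbow_idx]), 2))\n\n\n# 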
Let's plot the distance of the `min_points`-1 (here 27) nearest neighbor for each data point, sorted by distance.\n\n\n# get distances and sort\nk27 = np.sort(kdist[:, -1])\n\n# plot sorted distance as a function of data point index\nplt.plot(k27)\nplt.xlabel(\"observation index\")\nplt.ylabel(\"euclidean distance\")\nplt.show()\n\n\n# Looking at this plot, it seems like an `eps` of 1.75 might be a reasonable value, as it is the elbow point. However, we will choose a lower value of 1, as this plot is potentially biased by the first few indices. Now that we have our parameters, we can try out DBSCAN.\n\n\n# initialize variables\neps = 1\n\n# initialize dbscan object\ndbscan = clust.DBSCAN(eps=eps, min_samples=min_points, metric=\"euclidean\")\n\n# fit and predict labels with dbscan\ndbscan_pred = dbscan.fit_predict(pfx_vals_std_clean)\n\n\n# Let's take a look at the cluster results in our usual scatter plot.\n\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We can see that two clear clusters are separated but have many outliers within them. This is potentially related to the curse of dimensionality, as adding noise features that are not truly associated with the reponse will hurt density measurements. Instead, we will try DBSCAN in PCA space, where we now only have two dimensions. We will also increase the minimum number of points needed for a core point.\n\n\n# get pca data and scale\npca_data = pfx_pca_trans_clean[:, :2]\nstand = pp.StandardScaler()\npca_stand = stand.fit_transform(pca_data)\n\n# set min_points parameter\nmin_points = 8*pca_stand.shape[1]\n\n# initialize nearest-neighbor object\nnbrs = nn.NearestNeighbors(n_neighbors=min_points, metric=\"euclidean\")\n\n# fit nearest-neighbors\nnbrs_fit = nbrs.fit(pca_stand)\nkdist, kind = nbrs_fit.kneighbors(pca_stand)\n\n# get distances and sort\nk3 = np.sort(kdist[:, -1])\n\n# plot sorted distance as a function of data point index\nplt.plot(k3)\nplt.xlabel(\"observation index\")\nplt.ylabel(\"euclidean distance\")\nplt.show()\n\n\n# Again, there is an elbow point, but a smaller epsilon value than 0.2 might be useful here, as two clusters of the three clusters in PCA space are not well separated (but still distinguishable).\n\n\n# initialize variables\neps = 0.16\n\n# initialize dbscan object\ndbscan = clust.DBSCAN(eps=eps, min_samples=min_points, metric=\"euclidean\")\n\n# fit and predict labels with dbscan\ndbscan_pred = dbscan.fit_predict(pca_stand)\n\n\n# We'll project our cluster labels into PCA space to see how DBSCAN did visually.\n\n\n# values for plot\n#hor_mv = pfx_vals_clean[\"pfx_x\"]\n#speed = pfx_vals_clean[\"start_speed\"]\nhor_mv = pca_data[:, 0]\nspeed = pca_data[:, 1]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"PCA 1\")\nplt.ylabel(\"PCA 2\")\nplt.show()\n\n\n# We can see that the most dense parts of the clusters are well separated, with a lot of outliers also identified. 
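To quantify that impression (an added snippet; scikit-learn's DBSCAN marks noise points with the label -1), we can count the clusters and the flagged outliers directly:\n\n\n# added check: number of DBSCAN clusters and noise points (label -1 means noise)\nn_clusters = len(set(dbscan_pred)) - (1 if -1 in dbscan_pred else 0)\nn_noise = int(np.sum(dbscan_pred == -1))\nprint(n_clusters, 'clusters and', n_noise, 'noise points')\n\n\n# 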
Let's see what the classifications look like in actual feature space.\n\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=dbscan_pred, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# It looks fairly similar to the results using all trajectory features. However, most of the outliers are along the edges of the clusters, unlike when using all trajectory features.\n\n# Overall, it seems that Sabathia has three primary types of pitches:\n#\n# 1) \"Hard\" pitches; ones that comes in fast with various amounts of horizontal break, which determine the difference between four-seam fastballs and sinkers (yellow cluster above)\n#\n# 2) \"Off-speed\" pitches; ones that have similar horizontal break as \"hard\" pitches, but slower (blue cluster above)\n#\n# 3) \"Breaking\" pitches; ones that have very different horizontal break compared to the other two pitches (red cluster above)\n\n# ### K-means clustering\n\n# We'll now try K-means clustering. Every data point will be a part of a cluster, but we need to specify the number of clusters beforehand. Here we set it to 3, and also run Kmeans on the trajectory information.\n\n\n# initialize parameters\nnclust = 3\nniter = 500\n\n# run kmeans and predict label\nklabels = clust.KMeans(n_clusters=nclust, random_state=0,\n n_init=niter).fit_predict(pfx_vals_std_clean)\n\n\n# Let's plot the results in our usual domain.\n\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=klabels, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# Next, we'll try clustering in our PCA space.\n\n\n# run kmeans and predict label\nklabels = clust.KMeans(n_clusters=nclust, random_state=0,\n n_init=niter).fit_predict(pca_stand)\n\n# values for plot\nhor_mv = pfx_vals_clean[\"pfx_x\"]\nspeed = pfx_vals_clean[\"start_speed\"]\n\n# project label onto data points\nplt.scatter(hor_mv, speed, c=klabels, cmap=\"jet\")\nplt.xlabel(\"horizontal movement (in)\")\nplt.ylabel(\"start speed (mph)\")\nplt.show()\n\n\n# We see that the cluster results in PCA space lead to less-convincing clusters. This is expected, given that the clusters are not as well separated in PCA space as in start speed and horizontal movement space.\n\n# ### Feature correlation\n\n# Highly-correlated features can be an issue, particularly when using Euclidean distances as a metric. This is why we turned to clustering in PCA space, as PCA in some ways decorrelates the features through an orthogonal linear transformation. 
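As a quick added illustration of that point (a sketch reusing the PCA scores computed earlier), the correlation matrix of the PCA scores should be essentially diagonal:\n\n\n# added check: off-diagonal correlations between PCA scores should be ~0\nscore_corr = np.corrcoef(pfx_pca_trans_clean.T)\nprint(np.abs(score_corr - np.eye(score_corr.shape[0])).max().round(3))\n\n\n# 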
Here, we'll look at the correlation matrix of features to find features that are highly correlated.\n\n\n# create correlation matrix of features\ncof = np.corrcoef(pfx_vals_clean.T)\n", "project_metadata": {"full_name": "jasonpchang/pitchfx_sql", "description": "Create sql database of pitchfx data", "topics": [], "git_url": "git://github.com/jasonpchang/pitchfx_sql.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2017-10-17T20:06:05Z", "size": 29396, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 17171734, "Python": 109402, "Makefile": 441}, "last_updated": "2019-04-21T03:11:01Z"}, "intent": "# find features with high correlation"}, {"original_comment": "# Make a 10x10 plot of images\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Gaussian Mixture Models\n# In this notebook we will take a look at Gaussian mixture models (GMMs), which can be viewed as an extension of the ideas behind k-means, but can also be a powerful tool for estimation beyond simple clustering.\n# We begin with the standard imports:\n\n#%%\n\nimport seaborn as sns\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import load_digits\nfrom sklearn.datasets import make_circles\nfrom matplotlib.patches import Ellipse\nfrom sklearn.mixture import GaussianMixture\nfrom sklearn.datasets.samples_generator import make_blobs\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set()\n\n\n# A Gaussian mixture model (GMM) attempts to find a mixture of multi-dimensional Gaussian probability distributions that best model any input dataset. In the simplest case, GMMs can be used for finding clusters in the same manner as k-means. Generate 400 samples divided in 4 blobs with `cluster_std=0.60, random_state=0`.\n\n#%%\n\n# Generate some data\nX, y_true = make_blobs(n_samples=400, centers=4,\n cluster_std=0.60, random_state=0)\n\n\n# Now import the `GaussianMixture` object from `sklearn.mixture`. Fit a gaussian mixture with 4 components. The method `.predict()` of this object will assign each point to the most probable component.\n\n#%%\n\ngmm = GaussianMixture(n_components=4, random_state=42).fit(X)\nlabels = gmm.predict(X)\nplt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')\nplt.axis('equal')\n\n\n# Precisely because GMM is a probabilistic model, it is also possible to find probabilistic cluster assignments. In Scikit-Learn this is done using the `.predict_proba` method. Compute the probabilities for the dataset you have generated, print the shape of the ouput. What are the dimensions? Compute the sum for each row, interpret the results.\n\n#%%\n\nproba = gmm.predict_proba(X)\nproba.shape\n\n#%%\n\nproba.sum(axis=1)\n\n\n# If we want to obtain the abolute probabilities for each sample we can use the `score_samples` method. This will compute the log probability density for each sample. 
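Before that, a brief added aside on the soft assignments above (a sketch; the 0.9 cutoff is an arbitrary assumption): points whose highest membership probability is low sit near cluster boundaries.\n\n#%%\n\n# added sketch: flag points with ambiguous (soft) cluster membership\nambiguous = proba.max(axis=1) < 0.9\nprint(int(ambiguous.sum()), 'of', len(X), 'points have max probability below 0.9')\n\n\n# Back to the density values: 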
Compute the probabilities for each example of the previous dataset.\n\n#%%\n\nnp.exp(gmm.score_samples(X))\n\n\n# The following code generates a plot of the gmm fitted to a set of points and the shape of the fitted gaussian distributions\n\n#%%\n\ndef draw_ellipse(position, covariance, ax=None, **kwargs):\n \"\"\"Draw an ellipse with a given position and covariance\"\"\"\n ax = ax or plt.gca()\n\n # Convert covariance to principal axes\n if covariance.shape == (2, 2):\n U, s, Vt = np.linalg.svd(covariance)\n angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))\n width, height = 2 * np.sqrt(s)\n elif covariance.shape == (2,):\n angle = 0\n width, height = 2 * np.sqrt(covariance)\n else:\n angle = 0\n height = 2 * np.sqrt(covariance)\n width = 2 * np.sqrt(covariance)\n\n # Draw the Ellipse\n for nsig in range(1, 4):\n ax.add_patch(Ellipse(position, nsig * width, nsig * height,\n angle, **kwargs))\n\n\ndef plot_gmm(gmm, X, label=True, ax=None):\n ax = ax or plt.gca()\n labels = gmm.fit(X).predict(X)\n if label:\n ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')\n else:\n ax.scatter(X[:, 0], X[:, 1], s=40)\n ax.axis('equal')\n\n w_factor = 0.2 / gmm.weights_.max()\n if gmm.covariance_type == 'tied':\n for pos, w in zip(gmm.means_, gmm.weights_):\n draw_ellipse(pos, gmm.covariances_,\n alpha=w * w_factor, color='red')\n else:\n for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):\n draw_ellipse(pos, covar, alpha=w * w_factor, color='red')\n\n\n# Plot the Gaussian Mixture for the dataset you have created.\n\n#%%\n\nplot_gmm(gmm, X)\n\n\n# Similarly, we can use the GMM approach to fit a dataset with stretched clusters. The following code will stretch out the previous points.\n\n#%%\n\nrng = np.random.RandomState(13)\nX_stretched = np.dot(X, rng.randn(2, 2))\n\n\n# Now fit the GaussianMixture to this new data and plot the results. Try different covariance types\n\n#%%\n\ngmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)\nplot_gmm(gmm, X_stretched, label=True)\n\n\n# ## GMM as density estimation\n# Though GMM can be used as a clustering algorithm, fundamentally it is an algorithm for density estimation. The result of a GMM fit to some data is technically not a clustering model, but a generative probabilistic model describing the distribution of the data. As an example, generate the circles from sklearn. Generate 200 samples\n\n#%%\n\nX, y = make_circles(n_samples=200, noise=0.06, random_state=0, factor=0.4)\n\n#%%\n\nplt.scatter(X[:, 0], X[:, 1], s=40)\nplt.axis('equal')\n\n#%%\n\ngmm2 = GaussianMixture(n_components=2, covariance_type='full', random_state=0)\nplot_gmm(gmm2, X)\n\n#%%\n\ngmm = GaussianMixture(n_components=20, covariance_type='full', random_state=0)\nplot_gmm(gmm, X, label=False)\n\n\n# Here the mixture of 20 Gaussians serves not to find separated clusters of data, but rather to model the overall distribution of the input data. This is a generative model of the distribution, meaning that the GMM gives us the recipe to generate new random data distributed similarly to our input. Now generate new data from the fitted distribution. You may generate points with the method `sample`. Generate 200 points and plot them.\n\n#%%\n\nXnew = gmm.sample(200)\nplt.scatter(Xnew[0][:, 0], Xnew[0][:, 1])\nplt.axis('equal')\n\n\n# ## How do we choose the appropriate number of components?\n#\n# The fact that GMM is a generative model gives us a natural means of determining the optimal number of components for a given dataset. 
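One concrete added example (a sketch under the assumption that held-out log-likelihood is a reasonable selection criterion) is to cross-validate GaussianMixture's score method, which returns the average log-likelihood per sample:\n\n#%%\n\nfrom sklearn.model_selection import cross_val_score\n\n# added sketch: mean held-out log-likelihood for a few candidate component counts\nfor n in (5, 10, 20):\n    cv_scores = cross_val_score(GaussianMixture(n, random_state=0), X, cv=3)\n    print(n, 'components:', round(cv_scores.mean(), 2))\n\n\n# 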
A generative model is inherently a probability distribution for the dataset, and so we can simply evaluate the likelihood of the data under the model, using cross-validation to avoid over-fitting. Another means of correcting for over-fitting is to adjust the model likelihoods using some analytic criterion such as the Akaike information criterion (AIC) or the Bayesian information criterion (BIC). Scikit-Learn's GMM estimator actually includes built-in methods that compute both of these, and so it is very easy to operate on this approach.\n#\n# Let's look at the AIC and BIC as a function as the number of GMM components for our circles dataset. Generate models with number of components between 4 and 30, `random_sate=0` and store the BIC and AIC for each model. Finally plot them in a single plot.\n\n#%%\n\nn_components = np.arange(4, 30)\nmodels = [GaussianMixture(n, covariance_type='full', random_state=0).fit(X)\n for n in n_components]\n\nplt.plot(n_components, [m.bic(X) for m in models], label='BIC')\nplt.plot(n_components, [m.aic(X) for m in models], label='AIC')\nplt.legend(loc='best')\nplt.xlabel('n_components')\n\n\n# Extract the number of components that minimizes the AIC and the BIC. You may do so with `np.argmax`\n\n#%%\n\nnp.argmin([m.aic(X) for m in models]) + 4\n\n\n# ## Generating images of hand-written digits\n# We just saw a an example of using GMM as a generative model of data in order to create new samples from the distribution defined by the input data. Here we will run with this idea and generate new handwritten digits from the MNIST dataset. Load the dataset with `load_digits`.\n\n#%%\n\ndigits = load_digits()", "target_code": "fig, ax = plt.subplots(10, 10, figsize=(8, 8),\n subplot_kw=dict(xticks=[], yticks=[]))\nfig.subplots_adjust(hspace=0.05, wspace=0.05)\nfor i, axi in enumerate(ax.flat):\n im = axi.imshow(digits.data[i].reshape(8, 8), cmap='binary')\n im.set_clim(0, 16)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Gaussian Mixture Models\n# In this notebook we will take a look at Gaussian mixture models (GMMs), which can be viewed as an extension of the ideas behind k-means, but can also be a powerful tool for estimation beyond simple clustering.\n# We begin with the standard imports:\n\n\nimport seaborn as sns\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import load_digits\nfrom sklearn.datasets import make_circles\nfrom matplotlib.patches import Ellipse\nfrom sklearn.mixture import GaussianMixture\nfrom sklearn.datasets.samples_generator import make_blobs\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set()\n\n\n# A Gaussian mixture model (GMM) attempts to find a mixture of multi-dimensional Gaussian probability distributions that best model any input dataset. In the simplest case, GMMs can be used for finding clusters in the same manner as k-means. Generate 400 samples divided in 4 blobs with `cluster_std=0.60, random_state=0`.\n\n\n# Generate some data\nX, y_true = make_blobs(n_samples=400, centers=4,\n cluster_std=0.60, random_state=0)\n\n\n# Now import the `GaussianMixture` object from `sklearn.mixture`. Fit a gaussian mixture with 4 components. 
The method `.predict()` of this object will assign each point to the most probable component.\n\n\ngmm = GaussianMixture(n_components=4, random_state=42).fit(X)\nlabels = gmm.predict(X)\nplt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')\nplt.axis('equal')\n\n\n# Precisely because GMM is a probabilistic model, it is also possible to find probabilistic cluster assignments. In Scikit-Learn this is done using the `.predict_proba` method. Compute the probabilities for the dataset you have generated, print the shape of the ouput. What are the dimensions? Compute the sum for each row, interpret the results.\n\n\nproba = gmm.predict_proba(X)\nproba.shape\n\n\nproba.sum(axis=1)\n\n\n# If we want to obtain the abolute probabilities for each sample we can use the `score_samples` method. This will compute the log probability density for each sample. Compute the probabilities for each example of the previous dataset.\n\n\nnp.exp(gmm.score_samples(X))\n\n\n# The following code generates a plot of the gmm fitted to a set of points and the shape of the fitted gaussian distributions\n\n\ndef draw_ellipse(position, covariance, ax=None, **kwargs):\n \"\"\"Draw an ellipse with a given position and covariance\"\"\"\n ax = ax or plt.gca()\n\n # Convert covariance to principal axes\n if covariance.shape == (2, 2):\n U, s, Vt = np.linalg.svd(covariance)\n angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))\n width, height = 2 * np.sqrt(s)\n elif covariance.shape == (2,):\n angle = 0\n width, height = 2 * np.sqrt(covariance)\n else:\n angle = 0\n height = 2 * np.sqrt(covariance)\n width = 2 * np.sqrt(covariance)\n\n # Draw the Ellipse\n for nsig in range(1, 4):\n ax.add_patch(Ellipse(position, nsig * width, nsig * height,\n angle, **kwargs))\n\n\ndef plot_gmm(gmm, X, label=True, ax=None):\n ax = ax or plt.gca()\n labels = gmm.fit(X).predict(X)\n if label:\n ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')\n else:\n ax.scatter(X[:, 0], X[:, 1], s=40)\n ax.axis('equal')\n\n w_factor = 0.2 / gmm.weights_.max()\n if gmm.covariance_type == 'tied':\n for pos, w in zip(gmm.means_, gmm.weights_):\n draw_ellipse(pos, gmm.covariances_,\n alpha=w * w_factor, color='red')\n else:\n for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):\n draw_ellipse(pos, covar, alpha=w * w_factor, color='red')\n\n\n# Plot the Gaussian Mixture for the dataset you have created.\n\n\nplot_gmm(gmm, X)\n\n\n# Similarly, we can use the GMM approach to fit a dataset with stretched clusters. The following code will stretch out the previous points.\n\n\nrng = np.random.RandomState(13)\nX_stretched = np.dot(X, rng.randn(2, 2))\n\n\n# Now fit the GaussianMixture to this new data and plot the results. Try different covariance types\n\n\ngmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)\nplot_gmm(gmm, X_stretched, label=True)\n\n\n# ## GMM as density estimation\n# Though GMM can be used as a clustering algorithm, fundamentally it is an algorithm for density estimation. The result of a GMM fit to some data is technically not a clustering model, but a generative probabilistic model describing the distribution of the data. As an example, generate the circles from sklearn. 
Generate 200 samples\n\n\nX, y = make_circles(n_samples=200, noise=0.06, random_state=0, factor=0.4)\n\n\nplt.scatter(X[:, 0], X[:, 1], s=40)\nplt.axis('equal')\n\n\ngmm2 = GaussianMixture(n_components=2, covariance_type='full', random_state=0)\nplot_gmm(gmm2, X)\n\n\ngmm = GaussianMixture(n_components=20, covariance_type='full', random_state=0)\nplot_gmm(gmm, X, label=False)\n\n\n# Here the mixture of 20 Gaussians serves not to find separated clusters of data, but rather to model the overall distribution of the input data. This is a generative model of the distribution, meaning that the GMM gives us the recipe to generate new random data distributed similarly to our input. Now generate new data from the fitted distribution. You may generate points with the method `sample`. Generate 200 points and plot them.\n\n\nXnew = gmm.sample(200)\nplt.scatter(Xnew[0][:, 0], Xnew[0][:, 1])\nplt.axis('equal')\n\n\n# ## How do we choose the appropriate number of components?\n#\n# The fact that GMM is a generative model gives us a natural means of determining the optimal number of components for a given dataset. A generative model is inherently a probability distribution for the dataset, and so we can simply evaluate the likelihood of the data under the model, using cross-validation to avoid over-fitting. Another means of correcting for over-fitting is to adjust the model likelihoods using some analytic criterion such as the Akaike information criterion (AIC) or the Bayesian information criterion (BIC). Scikit-Learn's GMM estimator actually includes built-in methods that compute both of these, and so it is very easy to operate on this approach.\n#\n# Let's look at the AIC and BIC as a function as the number of GMM components for our circles dataset. Generate models with number of components between 4 and 30, `random_sate=0` and store the BIC and AIC for each model. Finally plot them in a single plot.\n\n\nn_components = np.arange(4, 30)\nmodels = [GaussianMixture(n, covariance_type='full', random_state=0).fit(X)\n for n in n_components]\n\nplt.plot(n_components, [m.bic(X) for m in models], label='BIC')\nplt.plot(n_components, [m.aic(X) for m in models], label='AIC')\nplt.legend(loc='best')\nplt.xlabel('n_components')\n\n\n# Extract the number of components that minimizes the AIC and the BIC. You may do so with `np.argmax`\n\n\nnp.argmin([m.aic(X) for m in models]) + 4\n\n\n# ## Generating images of hand-written digits\n# We just saw a an example of using GMM as a generative model of data in order to create new samples from the distribution defined by the input data. Here we will run with this idea and generate new handwritten digits from the MNIST dataset. 
Load the dataset with `load_digits`.\n\n\ndigits = load_digits()\n\n\n\n", "project_metadata": {"full_name": "pdejorge/BTS_Classical_Data_Analysis", "description": "Repository containing the different notebooks to be used during the Classical Data Analysis course in the Barcelona Technology School", "topics": [], "git_url": "git://github.com/pdejorge/BTS_Classical_Data_Analysis.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2019-02-07T20:02:28Z", "size": 75706, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 14445431}, "last_updated": "2019-10-31T23:46:29Z"}, "intent": "# Make a 10x10 plot of images"}, {"original_comment": " # avoid cutting off figure labels\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport matplotlib as mpl\nimport seaborn as sns\nimport sklearn\nimport yaml\nimport sys\nimport glob\nimport os\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ndef sum_cpu(node_res):\n \"\"\"Return sum of CPU resources allocated to all nodes\"\"\"\n cpu = sum([v['cpu'] for v in node_res])\n return cpu\n\n\ndef read_placement(placement, df_data, flow_dr=250):\n \"\"\"Read placement dict and write it to df_data. Then return.\"\"\"\n df_data['num_flows'].append(placement['input']['num_flows'])\n df_data['num_sources'].append(placement['input']['num_sources'])\n df_data['source_dr'].append(placement['input']['num_flows'] * flow_dr)\n df_data['num_instances'].append(placement['metrics']['num_instances'])\n df_data['max_e2e_delay'].append(placement['metrics']['max_endToEnd_delay'])\n df_data['total_delay'].append(placement['metrics']['total_delay'])\n df_data['runtime'].append(placement['metrics']['runtime'])\n df_data['total_cpu'].append(\n sum_cpu(placement['placement']['alloc_node_res']))\n return df_data\n\n\ndef read_results(results):\n \"\"\"Read result files matching the pattern and return df containing their metrics\"\"\"\n data = {'num_sources': [], 'num_flows': [], 'source_dr': [], 'num_instances': [],\n 'max_e2e_delay': [], 'total_delay': [], 'runtime': [], 'total_cpu': []}\n\n # iterate through result files\n for res in glob.glob(results):\n # open and save metrics of interest\n with open(res, 'r') as f:\n placement = yaml.load(f, Loader=yaml.SafeLoader)\n data = read_placement(placement, data)\n\n return pd.DataFrame(data).sort_values(by=['num_flows'])\n\n#%%\n\n# read results\ndataset = 'web_data'\nsources = 'three_source_dr250'\nresults = f'placement_data/{dataset}/{sources}/'\n\ndf_true = read_results(results + 'true/*.yaml')\ndf_fixed = read_results(results + 'fixed/*.yaml')\ndf_linear = read_results(results + 'linear/*.yaml')\ndf_boost = read_results(results + 'boosting/*.yaml')\ndf_svr = read_results(results + 'svr/*.yaml')\ndf_ml = read_results(results + 'ml/*.yaml')\n\n#%%\n\ndf_linear.head()\n\n#%%\n\ndef plot(x_col, x_label, y_col, y_label, save_plot=True, plot_fixed=True):\n sns.set(font_scale=1.3, style='white')\n fig, ax = plt.subplots()\n\n# plt.plot(df_true[x_col], df_true[y_col], label='True', color='black', marker='o')\n if plot_fixed:\n plt.plot(df_fixed[x_col], df_fixed[y_col],\n label='Fixed', color='green', marker='+')\n plt.plot(df_linear[x_col], df_linear[y_col],\n label='Linear', color='blue', marker='x')\n# plt.plot(df_boost[x_col], df_boost[y_col], label='Boosting', color='red', marker='^')\n# plt.plot(df_svr[x_col], df_svr[y_col], label='SVR', color='orange', marker='v')\n plt.plot(df_ml[x_col], 
df_ml[y_col],\n label='SVR+Boosting', color='red', marker='s')\n\n plt.xlabel(x_label)\n plt.ylabel(y_label)\n plt.legend()", "target_code": " plt.tight_layout()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport matplotlib as mpl\nimport seaborn as sns\nimport sklearn\nimport yaml\nimport sys\nimport glob\nimport os\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ndef sum_cpu(node_res):\n \"\"\"Return sum of CPU resources allocated to all nodes\"\"\"\n cpu = sum([v['cpu'] for v in node_res])\n return cpu\n\n\ndef read_placement(placement, df_data, flow_dr=250):\n \"\"\"Read placement dict and write it to df_data. Then return.\"\"\"\n df_data['num_flows'].append(placement['input']['num_flows'])\n df_data['num_sources'].append(placement['input']['num_sources'])\n df_data['source_dr'].append(placement['input']['num_flows'] * flow_dr)\n df_data['num_instances'].append(placement['metrics']['num_instances'])\n df_data['max_e2e_delay'].append(placement['metrics']['max_endToEnd_delay'])\n df_data['total_delay'].append(placement['metrics']['total_delay'])\n df_data['runtime'].append(placement['metrics']['runtime'])\n df_data['total_cpu'].append(\n sum_cpu(placement['placement']['alloc_node_res']))\n return df_data\n\n\ndef read_results(results):\n \"\"\"Read result files matching the pattern and return df containing their metrics\"\"\"\n data = {'num_sources': [], 'num_flows': [], 'source_dr': [], 'num_instances': [],\n 'max_e2e_delay': [], 'total_delay': [], 'runtime': [], 'total_cpu': []}\n\n # iterate through result files\n for res in glob.glob(results):\n # open and save metrics of interest\n with open(res, 'r') as f:\n placement = yaml.load(f, Loader=yaml.SafeLoader)\n data = read_placement(placement, data)\n\n return pd.DataFrame(data).sort_values(by=['num_flows'])\n\n\n# read results\ndataset = 'web_data'\nsources = 'three_source_dr250'\nresults = f'placement_data/{dataset}/{sources}/'\n\ndf_true = read_results(results + 'true/*.yaml')\ndf_fixed = read_results(results + 'fixed/*.yaml')\ndf_linear = read_results(results + 'linear/*.yaml')\ndf_boost = read_results(results + 'boosting/*.yaml')\ndf_svr = read_results(results + 'svr/*.yaml')\ndf_ml = read_results(results + 'ml/*.yaml')\n\n\ndf_linear.head()\n\n\ndef plot(x_col, x_label, y_col, y_label, save_plot=True, plot_fixed=True):\n sns.set(font_scale=1.3, style='white')\n fig, ax = plt.subplots()\n\n# plt.plot(df_true[x_col], df_true[y_col], label='True', color='black', marker='o')\n if plot_fixed:\n plt.plot(df_fixed[x_col], df_fixed[y_col],\n label='Fixed', color='green', marker='+')\n plt.plot(df_linear[x_col], df_linear[y_col],\n label='Linear', color='blue', marker='x')\n# plt.plot(df_boost[x_col], df_boost[y_col], label='Boosting', color='red', marker='^')\n# plt.plot(df_svr[x_col], df_svr[y_col], label='SVR', color='orange', marker='v')\n plt.plot(df_ml[x_col], df_ml[y_col],\n label='SVR+Boosting', color='red', marker='s')\n\n plt.xlabel(x_label)\n plt.ylabel(y_label)\n plt.legend()\n", "project_metadata": {"full_name": "CN-UPB/ml-for-resource-allocation", "description": "Machine Learning for Dynamic Resource Allocation in Network Function Virtualization", "topics": ["machine-learning", "resource-allocation", "prediction", "network-functions", "profiling", "vnf-placement", "benchmarking"], "git_url": "git://github.com/CN-UPB/ml-for-resource-allocation.git", "stars": 7, "watchers": 7, "forks": 2, "created": "2019-11-22T17:46:10Z", 
"size": 9819, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 732569}, "last_updated": "2020-07-28T05:44:36Z"}, "intent": " # avoid cutting off figure labels"}, {"original_comment": "# Convert to One Hot Encoded Values\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Self-Driving Car Engineer Nanodegree\n#\n# ## Deep Learning\n#\n# ## Project: Build a Traffic Sign Recognition Classifier\n#\n# In this notebook, a template is provided for you to implement your functionality in stages, which is required to successfully complete this project. If additional code is required that cannot be included in the notebook, be sure that the Python code is successfully imported and included in your submission if necessary.\n#\n# > **Note**: Once you have completed all of the code implementations, you need to finalize your work by exporting the iPython Notebook as an HTML document. Before exporting the notebook to html, all of the code cells need to have been run so that reviewers can see the final implementation and output. You can then export the notebook by using the menu above and navigating to \\n\",\n# \"**File -> Download as -> HTML (.html)**. Include the finished document along with this notebook as your submission.\n#\n# In addition to implementing code, there is a writeup to complete. The writeup should be completed in a separate file, which can be either a markdown file or a pdf document. There is a [write up template](https://github.com/udacity/CarND-Traffic-Sign-Classifier-Project/blob/master/writeup_template.md) that can be used to guide the writing process. Completing the code template and writeup template will cover all of the [rubric points](https://review.udacity.com/#!/rubrics/481/view) for this project.\n#\n# The [rubric](https://review.udacity.com/#!/rubrics/481/view) contains \"Stand Out Suggestions\" for enhancing the project beyond the minimum requirements. The stand out suggestions are optional. If you decide to pursue the \"stand out suggestions\", you can include the code in this Ipython notebook and also discuss the results in the writeup file.\n#\n#\n# >**Note:** Code and Markdown cells can be executed using the **Shift + Enter** keyboard shortcut. 
In addition, Markdown cells can be edited by typically double-clicking the cell to enter edit mode.\n\n# ---\n# ## Step 0: Load The Data\n\n#%%\n\n# Import Libraries\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nfrom sklearn import preprocessing\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import confusion_matrix\nfrom pylab import rcParams\nimport tensorflow as tf\nimport pickle\nimport time\nimport os\nimport cv2\nimport csv\nfrom PIL import Image\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# Load pickled data\n\ntraining_file = \"train.p\"\nvalidation_file = \"valid.p\"\ntesting_file = \"test.p\"\n\nwith open(training_file, mode='rb') as f:\n train = pickle.load(f)\nwith open(validation_file, mode='rb') as f:\n valid = pickle.load(f)\nwith open(testing_file, mode='rb') as f:\n test = pickle.load(f)\n\nX_train, y_train = train['features'], train['labels']\nX_valid, y_valid = valid['features'], valid['labels']\nX_test, y_test = test['features'], test['labels']\n\n\n# ---\n#\n# ## Step 1: Dataset Summary & Exploration\n#\n# The pickled data is a dictionary with 4 key/value pairs:\n#\n# - `'features'` is a 4D array containing raw pixel data of the traffic sign images, (num examples, width, height, channels).\n# - `'labels'` is a 1D array containing the label/class id of the traffic sign. The file `signnames.csv` contains id -> name mappings for each id.\n# - `'sizes'` is a list containing tuples, (width, height) representing the the original width and height the image.\n# - `'coords'` is a list containing tuples, (x1, y1, x2, y2) representing coordinates of a bounding box around the sign in the image. **THESE COORDINATES ASSUME THE ORIGINAL IMAGE. THE PICKLED DATA CONTAINS RESIZED VERSIONS (32 by 32) OF THESE IMAGES**\n#\n# Complete the basic data summary below. Use python, numpy and/or pandas methods to calculate the data summary rather than hard coding the results. For example, the [pandas shape method](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shape.html) might be useful for calculating some of the summary results.\n\n# ### Provide a Basic Summary of the Data Set Using Python, Numpy and/or Pandas\n\n#%%\n\n# Basic data summary.\n\n# Number of training examples\nn_train = len(X_train)\n\n# Number of testing examples\nn_test = len(X_test)\n\n# What's the shape of an image?\nimage_shape = X_train[0].shape\n\n# How many classes are in the dataset\nn_classes = len(np.unique(y_train))\n\nprint(\"Number of training examples =\", n_train)\nprint(\"Number of testing examples =\", n_test)\nprint(\"Image data shape =\", image_shape)\nprint(\"Number of classes =\", n_classes)\n\n\n# ### Include an exploratory visualization of the dataset\n\n# Visualize the German Traffic Signs Dataset using the pickled file(s). This is open ended, suggestions include: plotting traffic sign images, plotting the count of each sign, etc.\n#\n# The [Matplotlib](http://matplotlib.org/) [examples](http://matplotlib.org/examples/index.html) and [gallery](http://matplotlib.org/gallery.html) pages are a great resource for doing visualizations in Python.\n#\n# **NOTE:** It's recommended you start with something simple first. 
If you wish to do more, come back to it after you've completed the rest of the sections.\n\n#%%\n\n# Data exploration visualization\nfig = plt.figure(figsize=(15, 5))\n\nimage_seq = np.random.randint(1, len(X_train), 10)\n\n# Load image labels from csv\nlabel_csv = csv.reader(open('signnames.csv', 'r'))\nlabel_names = []\nfor row in label_csv:\n label_names.append(row[1])\nlabel_names.pop(0)\n\nfor ind, val in enumerate(image_seq):\n img = fig.add_subplot(2, 5, ind+1)\n plt.imshow(X_train[val-1])\n # Add corresponding label\n img.set_xlabel(\"{0} ({1})\".format(\n y_train[val-1], label_names[y_train[val-1]]))\n # Remove the axis ticks\n img.set_xticks([])\n img.set_yticks([])\n\nplt.show()\n\n#%%\n\n# Plot dataset distribution\nunique, counts = np.unique(y_train, return_counts=True)\nfig = plt.figure(figsize=(15, 5))\nplt.bar(unique, counts)\n\nlabel = [label for label in label_names]\n\nplt.xticks(np.arange(0.5, n_classes+0.5), label, rotation=45, ha='right')\n\nplt.ylabel('Frequency')\nplt.title('Training Data Distribution')\nplt.show()\n\n\n# ----\n#\n# ## Step 2: Design and Test a Model Architecture\n#\n# Design and implement a deep learning model that learns to recognize traffic signs. Train and test your model on the [German Traffic Sign Dataset](http://benchmark.ini.rub.de/?section=gtsrb&subsection=dataset).\n#\n# There are various aspects to consider when thinking about this problem:\n#\n# - Neural network architecture\n# - Play around preprocessing techniques (normalization, rgb to grayscale, etc)\n# - Number of examples per label (some have more than others).\n# - Generate fake data.\n#\n# Here is an example of a [published baseline model on this problem](http://yann.lecun.com/exdb/publis/pdf/sermanet-ijcnn-11.pdf). It's not required to be familiar with the approach used in the paper but, it's good practice to try to read papers like these.\n#\n# **NOTE:** The LeNet-5 implementation shown in the [classroom](https://classroom.udacity.com/nanodegrees/nd013/parts/fbf77062-5703-404e-b60c-95b78b2f3f9e/modules/6df7ae49-c61c-4bb2-a23e-6527e69209ec/lessons/601ae704-1035-4287-8b11-e2c2716217ad/concepts/d4aca031-508f-4e0b-b493-e7b706120f81) at the end of the CNN lesson is a solid starting point. You'll have to change the number of classes and possibly the preprocessing, but aside from that it's plug and play!\n\n# ### Pre-process the Data Set (normalization, grayscale, etc.)\n\n# Use the code cell (or multiple code cells, if necessary) to implement the first step of your project.\n\n#%%\n\n# Preprocess the data here. Preprocessing steps could include normalization, converting to grayscale, etc.\n'''\n# Grayscale conversion\n\n#Weighted average approach utilizing numpy functions. 
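The commented-out block above describes grayscale conversion with the luminance weights Y' = 0.299 R + 0.587 G + 0.114 B, followed by zero-mean normalization. A small self-contained sketch of both steps on a dummy batch (the array names and batch size are illustrative):

import numpy as np

# dummy batch of four 32x32 RGB images with pixel values in [0, 255]
X = np.random.randint(0, 256, size=(4, 32, 32, 3)).astype('float32')

# weighted-average grayscale: Y' = 0.299 R + 0.587 G + 0.114 B
X_gray = np.dot(X[..., :3], [0.299, 0.587, 0.114])   # shape (4, 32, 32)

# zero-mean, unit-variance normalization, as applied to the sign images in the notebook
X_norm = (X_gray - np.mean(X_gray)) / np.std(X_gray)
print(X_gray.shape, round(float(X_norm.mean()), 3), round(float(X_norm.std()), 3))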
No need for OpenCV\n#Y' = 0.299 R + 0.587 G + 0.114 B \n\nX_train_gray = np.dot(X_train[...][...,:3],[0.299,0.587,0.114])\nX_test_gray = np.dot(X_test[...][...,:3],[0.299,0.587,0.114])\n'''\n\n# Variable names defined for convenience\nX_train_gray = X_train\nX_valid_gray = X_valid\nX_test_gray = X_test\n\n# Normalize data (Zero-Mean)\nX_train_gray_norm = (X_train_gray - np.mean(X_train_gray))/np.std(X_train_gray)\nX_valid_gray_norm = (X_valid_gray - np.mean(X_valid_gray))/np.std(X_valid_gray)\nX_test_gray_norm = (X_test_gray - np.mean(X_test_gray))/np.std(X_test_gray)\n\n#%%\n\n# One-Hot Encoding\n'''\nSince we already have the labels we can use Label Binarizer.\n'''\nencoder = preprocessing.LabelBinarizer()\ny_train_oh = encoder.fit_transform(y_train)\ny_valid_oh = encoder.fit_transform(y_valid)\ny_test_oh = encoder.fit_transform(y_test)\n\n#%%\n\n# Shuffle data\n\ntrain_features, train_labels = shuffle(X_train_gray_norm, y_train_oh)\nvalid_features, valid_labels = shuffle(X_valid_gray_norm, y_valid_oh)\n\n#%%\n\n# Flatten/Reshape data\nimage_size = len(train_features[0])\nn_input = image_size**2\nnum_channels = 3\n\ntrain_features_f = np.reshape(\n train_features, [-1, image_size, image_size, num_channels])\nvalid_features_f = np.reshape(\n valid_features, [-1, image_size, image_size, num_channels])\nX_test_gray_flat = np.reshape(\n X_test_gray_norm, [-1, image_size, image_size, num_channels])\n\n\n# ### Model Architecture\n\n#%%\n\n# Helper Functions\ndef calc_weights(shape):\n return tf.Variable(tf.truncated_normal(shape, stddev=0.01))\n\n\ndef calc_biases(length):\n return tf.Variable(tf.zeros(length))\n\n\ndef maxpool(ip, stride_len=2):\n filter_size = [1, stride_len, stride_len, 1]\n return tf.nn.max_pool(ip, ksize=filter_size, strides=[1, stride_len, stride_len, 1], padding='VALID')\n\n\ndef dropout(layer, dropout_prob):\n return tf.nn.dropout(layer, dropout_prob)\n\n\ndef conv_layer(ip, weights, biases, stride_len=1):\n '''\n input_layer_width -> Number of inputs from previous layer\n output_layer_width -> width of layer\n '''\n c_layer = tf.nn.conv2d(ip, weights, strides=[\n 1, stride_len, stride_len, 1], padding='VALID')\n c_layer = tf.nn.bias_add(c_layer, biases)\n\n c_layer = maxpool(c_layer, 2) # stride length for max_pool = 2\n\n c_layer = tf.nn.relu(c_layer)\n return c_layer\n\n\ndef reshape_layer(ip):\n '''\n Flatten or reshape conv_layer for input to fc_layer\n '''\n flat_layer = tf.reshape(ip, [-1, ip.get_shape()[1:4].num_elements()])\n\n #print(\"Flat Layer: %s\" %flat_layer.get_shape())\n\n return flat_layer\n\n\ndef fully_conn_layer(ip, weights, biases):\n '''\n input_layer_width -> Number of inputs from previous flattened layer\n output_layer_width -> width of layer, or equal to number of classes for prediction layer\n '''\n flat_input = reshape_layer(ip)\n fc_layer = tf.add(tf.matmul(flat_input, weights), biases)\n\n #print(\"FC Layer: %s\" %fc_layer.get_shape())\n\n # Not applying ReLU here because this could be prediction layer\n return fc_layer\n\n\ndef plot_loss_accuracy(batches, loss_batch, train_acc_batch, valid_acc_batch):\n loss_plot = plt.subplot(211)\n loss_plot.set_title('Loss')\n loss_plot.plot(batches, loss_batch, 'g')\n loss_plot.set_xlim([batches[0], batches[-1]])\n acc_plot = plt.subplot(212)\n acc_plot.set_title('Accuracy')\n acc_plot.plot(batches, train_acc_batch, 'r', label='Training Accuracy')\n acc_plot.plot(batches, valid_acc_batch, 'b', label='Validation Accuracy')\n acc_plot.set_ylim([0, 1.0])\n acc_plot.set_xlim([batches[0], batches[-1]])\n 
acc_plot.legend(loc=4)\n plt.tight_layout()\n plt.show()\n\n\ndef plot_confusion_matrix(cls_pred):\n test_cls = np.argmax(y_test_oh, axis=1)\n cm = confusion_matrix(y_true=test_cls,\n y_pred=cls_pred)\n\n # Print the confusion matrix as text.\n # print(cm)\n\n rcParams['figure.figsize'] = 13, 13\n\n # Plot the confusion matrix as an image.\n plt.matshow(cm)\n\n # Make various adjustments to the plot.\n plt.colorbar()\n tick_marks = np.arange(n_classes)\n plt.xticks(tick_marks, range(n_classes))\n plt.yticks(tick_marks, range(n_classes))\n plt.xlabel('Predicted')\n plt.ylabel('True')\n\n plt.show()\n\n\ndef print_test_accuracy():\n test_cls = np.argmax(y_test_oh, axis=1)\n cls_pred = np.zeros(shape=n_test, dtype=np.int)\n i = 0\n\n while i < n_test:\n j = min(i + batch_size, n_test)\n\n batch_x = X_test_gray_flat[i:j]\n batch_y = y_test_oh[i:j]\n\n feed_dict = {network_input: batch_x,\n network_output: batch_y, keep_prob: 1.0}\n\n cls_pred[i:j] = session.run(label_pred_class, feed_dict=feed_dict)\n\n i = j\n\n correct = (test_cls == cls_pred)\n correct_sum = correct.sum()\n\n acc = float(correct_sum) / n_test\n\n msg = \"Accuracy on Test-Set: {0:.1%} ({1} / {2})\"\n print(msg.format(acc, correct_sum, n_test))\n\n plot_confusion_matrix(cls_pred=cls_pred)\n\n#%%\n\n# Convnet model\ndef convnet_model(ip, weights_dict, biases_dict, dropout_prob):\n\n # First conv layer\n c_layer_1 = conv_layer(\n ip, weights_dict['layer_1'], biases_dict['layer_1'], stride_len=1)\n c_layer_1 = dropout(c_layer_1, dropout_prob)\n\n # Second conv layer\n c_layer_2 = conv_layer(\n c_layer_1, weights_dict['layer_2'], biases_dict['layer_2'], stride_len=1)\n c_layer_2 = dropout(c_layer_2, dropout_prob)\n\n # FC Layer\n fc_layer = fully_conn_layer(\n c_layer_2, weights_dict['fc_layer'], biases_dict['fc_layer'])\n\n fc_layer = tf.nn.relu(fc_layer)\n fc_layer = dropout(fc_layer, dropout_prob)\n\n # Output fc layer\n out = fully_conn_layer(fc_layer, weights_dict['out'], biases_dict['out'])\n\n return out\n\n#%%\n\n# Dimensions\nlayer_width = {\n 'layer_1': 32,\n 'layer_2': 64,\n 'fc_layer': 1024,\n 'out': n_classes\n} # number of filters\n\nfilter_size = {\n 'layer_1': 5,\n 'layer_2': 5\n} # conv filter size\n\n\n# Weights and biases\nweights_dict = {\n 'layer_1': calc_weights([filter_size['layer_1'], filter_size['layer_1'], num_channels, layer_width['layer_1']]),\n 'layer_2': calc_weights([filter_size['layer_2'], filter_size['layer_2'], layer_width['layer_1'], layer_width['layer_2']]),\n 'fc_layer': calc_weights([1600, layer_width['fc_layer']]),\n 'out': calc_weights([layer_width['fc_layer'], layer_width['out']])\n}\n\nbiases_dict = {\n 'layer_1': calc_biases([layer_width['layer_1']]),\n 'layer_2': calc_biases([layer_width['layer_2']]),\n 'fc_layer': calc_biases([layer_width['fc_layer']]),\n 'out': calc_biases([layer_width['out']])\n}\n\n#%%\n\n# Define Parameters\nlearning_rate = 0.001\ntraining_epochs = 20\nbatch_size = 64\ndisplay_step = 1\n\n#%%\n\n# Input graph for TF\nnetwork_input = tf.placeholder(\n \"float\", [None, image_size, image_size, num_channels])\nnetwork_output = tf.placeholder(\"float\", [None, n_classes])\nkeep_prob = tf.placeholder(tf.float32)\n\nlogits = convnet_model(network_input, weights_dict, biases_dict, keep_prob)\n\nlabel_pred = tf.nn.softmax(logits)\nlabel_pred_class = tf.argmax(label_pred, 1)\n\n# Cost function and Optimizer\ncost = tf.reduce_mean(\n tf.nn.softmax_cross_entropy_with_logits(logits, network_output))\noptimizer = 
tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)\n\n# Check if predictions are correct.\ncorrect_prediction = tf.equal(label_pred_class, tf.argmax(network_output, 1))\naccuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n\n\n# ### Train, Validate and Test the Model\n\n# A validation set can be used to assess how well the model is performing. A low accuracy on the training and validation\n# sets imply underfitting. A high accuracy on the test set but low accuracy on the validation set implies overfitting.\n\n#%%\n\n# Train your model here.\n# Calculate and report the accuracy on the training and validation set.\n# Once a final model architecture is selected,\n# the accuracy on the test set should be calculated and reported as well..\n\n# Initialize variables\ninit = tf.global_variables_initializer()\n#init = tf.initialize_all_variables()\n\nbatches = []\nloss_batch = []\ntrain_acc_batch = []\nvalid_acc_batch = []\n\ntotal_time = time.time()\n# Launch graph\nsession = tf.Session()\nsession.run(init)\n\n# Training cycle\nfor epochs in range(training_epochs):\n start_time = time.time()\n avg_cost = 0.\n total_batch = int(len(train_features)/batch_size)\n\n train_features_f, train_labels = shuffle(train_features_f, train_labels)\n valid_features_f, valid_labels = shuffle(valid_features_f, valid_labels)\n\n for i in range(total_batch):\n idx_l = i*batch_size\n idx_h = idx_l + batch_size\n\n batch_x = train_features_f[idx_l:idx_h]\n batch_y = train_labels[idx_l:idx_h]\n\n _, c = session.run([optimizer, cost], feed_dict={\n network_input: batch_x, network_output: batch_y, keep_prob: 0.5})\n\n # Compute average loss\n avg_cost += c / total_batch\n\n # Log every 50 batches\n if not i % 50:\n # Calculate Training and Validation accuracy\n training_accuracy = session.run(accuracy, feed_dict={network_input: train_features_f,\n network_output: train_labels, keep_prob: 0.5})\n validation_accuracy = session.run(accuracy, feed_dict={network_input: valid_features_f,\n network_output: valid_labels, keep_prob: 1})\n\n # Log batches\n previous_batch = batches[-1] if batches else 0\n batches.append(50 + previous_batch)\n loss_batch.append(c)\n train_acc_batch.append(training_accuracy)\n valid_acc_batch.append(validation_accuracy)\n\n # Display logs per epoch step\n if epochs % display_step == 0:\n print(\"Epoch:\", '%04d' % (epochs+1), \"cost=\",\n \"{:.9f}\".format(avg_cost))\n end_time = time.time() - start_time\n print(\"Time after epoch: %s\" % end_time)\n\n # Check accuracy against Validation data\n validation_accuracy = session.run(accuracy, feed_dict={network_input: valid_features_f,\n network_output: valid_labels, keep_prob: 1})\n\nprint(\"Optimization Finished!\")\n\ntest_accuracy = session.run(accuracy, feed_dict={\n network_input: X_test_gray_flat, network_output: y_test_oh, keep_prob: 1})\nprint(\"Training Accuracy:\", training_accuracy)\nprint(\"Validation Accuracy:\", validation_accuracy)\nprint(\"Test Accuracy:\", test_accuracy)\n\nplot_loss_accuracy(batches, loss_batch, train_acc_batch, valid_acc_batch)\n\nfinal_time = time.time() - total_time\nprint(\"Time taken: %s\" % final_time)\n\n#%%\n\n# Confusion Matrix\nprint_test_accuracy()\n\n\n# ---\n#\n# ## Step 3: Test a Model on New Images\n#\n# To give yourself more insight into how your model is working, download at least five pictures of German traffic signs from the web and use your model to predict the traffic sign type.\n#\n# You may find `signnames.csv` useful as it contains mappings from the class id 
(integer) to the actual sign name.\n\n# ### Load and Output the Images\n\n#%%\n\n# Load the images and plot them here.\n\nimages = os.listdir(\"test_images/\")\n\nfig = plt.figure(figsize=(12, 5))\n\ntest_images = []\n\n# Read in test images\nfor idx, val in enumerate(images):\n # reading in an image\n image = cv2.imread('test_images/' + images[idx])\n if len(image.shape) > 2 and image.shape[2] == 4:\n # convert the image from RGBA2RGB\n image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)\n else:\n image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n\n if image.shape[0] != 32:\n\n image = cv2.resize(image, (32, 32))\n test_images.append(image)\n\n\n# Create subplots in figure\nfor r in range(1, len(test_images)+1):\n img = fig.add_subplot(2, 4, r)\n plt.imshow(test_images[r-1])\n img.set_xticks([])\n img.set_yticks([])\n\nplt.show()\n\n#%%\n\n# Preprocess images\n# The images are already 32x32\n\n# Normalize data (Zero-Mean)\ntest_images_norm = (test_images - np.mean(test_images))/np.std(test_images)\n\n# Reshape\ntest_images_norm_f = np.reshape(\n test_images_norm, [-1, image_size, image_size, num_channels])\nprint(test_images_norm_f.shape)\n\n# Define True Labels\ny_new_true = np.array([27, 23, 13, 17, 2, 29, 1, 14])", "target_code": "y_new_true_one_hot = (np.arange(n_classes) ==\n y_new_true[:, None]).astype(np.float)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Self-Driving Car Engineer Nanodegree\n#\n# ## Deep Learning\n#\n# ## Project: Build a Traffic Sign Recognition Classifier\n#\n# In this notebook, a template is provided for you to implement your functionality in stages, which is required to successfully complete this project. If additional code is required that cannot be included in the notebook, be sure that the Python code is successfully imported and included in your submission if necessary.\n#\n# > **Note**: Once you have completed all of the code implementations, you need to finalize your work by exporting the iPython Notebook as an HTML document. Before exporting the notebook to html, all of the code cells need to have been run so that reviewers can see the final implementation and output. You can then export the notebook by using the menu above and navigating to \\n\",\n# \"**File -> Download as -> HTML (.html)**. Include the finished document along with this notebook as your submission.\n#\n# In addition to implementing code, there is a writeup to complete. The writeup should be completed in a separate file, which can be either a markdown file or a pdf document. There is a [write up template](https://github.com/udacity/CarND-Traffic-Sign-Classifier-Project/blob/master/writeup_template.md) that can be used to guide the writing process. Completing the code template and writeup template will cover all of the [rubric points](https://review.udacity.com/#!/rubrics/481/view) for this project.\n#\n# The [rubric](https://review.udacity.com/#!/rubrics/481/view) contains \"Stand Out Suggestions\" for enhancing the project beyond the minimum requirements. The stand out suggestions are optional. If you decide to pursue the \"stand out suggestions\", you can include the code in this Ipython notebook and also discuss the results in the writeup file.\n#\n#\n# >**Note:** Code and Markdown cells can be executed using the **Shift + Enter** keyboard shortcut. 
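The target snippet in the record above builds one-hot labels by broadcasting a comparison against np.arange(n_classes). A self-contained sketch of that trick, using the built-in float in place of the deprecated np.float alias and an illustrative class count:

import numpy as np

n_classes = 43                       # illustrative; the notebook derives this from the data
y_new_true = np.array([27, 23, 13, 17, 2, 29, 1, 14])

# broadcasting trick from the target snippet: compare every label against the class range
y_new_true_one_hot = (np.arange(n_classes) == y_new_true[:, None]).astype(float)
print(y_new_true_one_hot.shape)        # (8, 43)
print(y_new_true_one_hot[0].argmax())  # 27, i.e. the hot index equals the label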
In addition, Markdown cells can be edited by typically double-clicking the cell to enter edit mode.\n\n# ---\n# ## Step 0: Load The Data\n\n\n# Import Libraries\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nfrom sklearn import preprocessing\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\nfrom sklearn.metrics import confusion_matrix\nfrom pylab import rcParams\nimport tensorflow as tf\nimport pickle\nimport time\nimport os\nimport cv2\nimport csv\nfrom PIL import Image\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Load pickled data\n\ntraining_file = \"train.p\"\nvalidation_file = \"valid.p\"\ntesting_file = \"test.p\"\n\nwith open(training_file, mode='rb') as f:\n train = pickle.load(f)\nwith open(validation_file, mode='rb') as f:\n valid = pickle.load(f)\nwith open(testing_file, mode='rb') as f:\n test = pickle.load(f)\n\nX_train, y_train = train['features'], train['labels']\nX_valid, y_valid = valid['features'], valid['labels']\nX_test, y_test = test['features'], test['labels']\n\n\n# ---\n#\n# ## Step 1: Dataset Summary & Exploration\n#\n# The pickled data is a dictionary with 4 key/value pairs:\n#\n# - `'features'` is a 4D array containing raw pixel data of the traffic sign images, (num examples, width, height, channels).\n# - `'labels'` is a 1D array containing the label/class id of the traffic sign. The file `signnames.csv` contains id -> name mappings for each id.\n# - `'sizes'` is a list containing tuples, (width, height) representing the the original width and height the image.\n# - `'coords'` is a list containing tuples, (x1, y1, x2, y2) representing coordinates of a bounding box around the sign in the image. **THESE COORDINATES ASSUME THE ORIGINAL IMAGE. THE PICKLED DATA CONTAINS RESIZED VERSIONS (32 by 32) OF THESE IMAGES**\n#\n# Complete the basic data summary below. Use python, numpy and/or pandas methods to calculate the data summary rather than hard coding the results. For example, the [pandas shape method](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shape.html) might be useful for calculating some of the summary results.\n\n# ### Provide a Basic Summary of the Data Set Using Python, Numpy and/or Pandas\n\n\n# Basic data summary.\n\n# Number of training examples\nn_train = len(X_train)\n\n# Number of testing examples\nn_test = len(X_test)\n\n# What's the shape of an image?\nimage_shape = X_train[0].shape\n\n# How many classes are in the dataset\nn_classes = len(np.unique(y_train))\n\nprint(\"Number of training examples =\", n_train)\nprint(\"Number of testing examples =\", n_test)\nprint(\"Image data shape =\", image_shape)\nprint(\"Number of classes =\", n_classes)\n\n\n# ### Include an exploratory visualization of the dataset\n\n# Visualize the German Traffic Signs Dataset using the pickled file(s). This is open ended, suggestions include: plotting traffic sign images, plotting the count of each sign, etc.\n#\n# The [Matplotlib](http://matplotlib.org/) [examples](http://matplotlib.org/examples/index.html) and [gallery](http://matplotlib.org/gallery.html) pages are a great resource for doing visualizations in Python.\n#\n# **NOTE:** It's recommended you start with something simple first. 
If you wish to do more, come back to it after you've completed the rest of the sections.\n\n\n# Data exploration visualization\nfig = plt.figure(figsize=(15, 5))\n\nimage_seq = np.random.randint(1, len(X_train), 10)\n\n# Load image labels from csv\nlabel_csv = csv.reader(open('signnames.csv', 'r'))\nlabel_names = []\nfor row in label_csv:\n label_names.append(row[1])\nlabel_names.pop(0)\n\nfor ind, val in enumerate(image_seq):\n img = fig.add_subplot(2, 5, ind+1)\n plt.imshow(X_train[val-1])\n # Add corresponding label\n img.set_xlabel(\"{0} ({1})\".format(\n y_train[val-1], label_names[y_train[val-1]]))\n # Remove the axis ticks\n img.set_xticks([])\n img.set_yticks([])\n\nplt.show()\n\n\n# Plot dataset distribution\nunique, counts = np.unique(y_train, return_counts=True)\nfig = plt.figure(figsize=(15, 5))\nplt.bar(unique, counts)\n\nlabel = [label for label in label_names]\n\nplt.xticks(np.arange(0.5, n_classes+0.5), label, rotation=45, ha='right')\n\nplt.ylabel('Frequency')\nplt.title('Training Data Distribution')\nplt.show()\n\n\n# ----\n#\n# ## Step 2: Design and Test a Model Architecture\n#\n# Design and implement a deep learning model that learns to recognize traffic signs. Train and test your model on the [German Traffic Sign Dataset](http://benchmark.ini.rub.de/?section=gtsrb&subsection=dataset).\n#\n# There are various aspects to consider when thinking about this problem:\n#\n# - Neural network architecture\n# - Play around preprocessing techniques (normalization, rgb to grayscale, etc)\n# - Number of examples per label (some have more than others).\n# - Generate fake data.\n#\n# Here is an example of a [published baseline model on this problem](http://yann.lecun.com/exdb/publis/pdf/sermanet-ijcnn-11.pdf). It's not required to be familiar with the approach used in the paper but, it's good practice to try to read papers like these.\n#\n# **NOTE:** The LeNet-5 implementation shown in the [classroom](https://classroom.udacity.com/nanodegrees/nd013/parts/fbf77062-5703-404e-b60c-95b78b2f3f9e/modules/6df7ae49-c61c-4bb2-a23e-6527e69209ec/lessons/601ae704-1035-4287-8b11-e2c2716217ad/concepts/d4aca031-508f-4e0b-b493-e7b706120f81) at the end of the CNN lesson is a solid starting point. You'll have to change the number of classes and possibly the preprocessing, but aside from that it's plug and play!\n\n# ### Pre-process the Data Set (normalization, grayscale, etc.)\n\n# Use the code cell (or multiple code cells, if necessary) to implement the first step of your project.\n\n\n# Preprocess the data here. Preprocessing steps could include normalization, converting to grayscale, etc.\n'''\n# Grayscale conversion\n\n#Weighted average approach utilizing numpy functions. 
No need for OpenCV\n#Y' = 0.299 R + 0.587 G + 0.114 B \n\nX_train_gray = np.dot(X_train[...][...,:3],[0.299,0.587,0.114])\nX_test_gray = np.dot(X_test[...][...,:3],[0.299,0.587,0.114])\n'''\n\n# Variable names defined for convenience\nX_train_gray = X_train\nX_valid_gray = X_valid\nX_test_gray = X_test\n\n# Normalize data (Zero-Mean)\nX_train_gray_norm = (X_train_gray - np.mean(X_train_gray))/np.std(X_train_gray)\nX_valid_gray_norm = (X_valid_gray - np.mean(X_valid_gray))/np.std(X_valid_gray)\nX_test_gray_norm = (X_test_gray - np.mean(X_test_gray))/np.std(X_test_gray)\n\n\n# One-Hot Encoding\n'''\nSince we already have the labels we can use Label Binarizer.\n'''\nencoder = preprocessing.LabelBinarizer()\ny_train_oh = encoder.fit_transform(y_train)\ny_valid_oh = encoder.fit_transform(y_valid)\ny_test_oh = encoder.fit_transform(y_test)\n\n\n# Shuffle data\n\ntrain_features, train_labels = shuffle(X_train_gray_norm, y_train_oh)\nvalid_features, valid_labels = shuffle(X_valid_gray_norm, y_valid_oh)\n\n\n# Flatten/Reshape data\nimage_size = len(train_features[0])\nn_input = image_size**2\nnum_channels = 3\n\ntrain_features_f = np.reshape(\n train_features, [-1, image_size, image_size, num_channels])\nvalid_features_f = np.reshape(\n valid_features, [-1, image_size, image_size, num_channels])\nX_test_gray_flat = np.reshape(\n X_test_gray_norm, [-1, image_size, image_size, num_channels])\n\n\n# ### Model Architecture\n\n\n# Helper Functions\ndef calc_weights(shape):\n return tf.Variable(tf.truncated_normal(shape, stddev=0.01))\n\n\ndef calc_biases(length):\n return tf.Variable(tf.zeros(length))\n\n\ndef maxpool(ip, stride_len=2):\n filter_size = [1, stride_len, stride_len, 1]\n return tf.nn.max_pool(ip, ksize=filter_size, strides=[1, stride_len, stride_len, 1], padding='VALID')\n\n\ndef dropout(layer, dropout_prob):\n return tf.nn.dropout(layer, dropout_prob)\n\n\ndef conv_layer(ip, weights, biases, stride_len=1):\n '''\n input_layer_width -> Number of inputs from previous layer\n output_layer_width -> width of layer\n '''\n c_layer = tf.nn.conv2d(ip, weights, strides=[\n 1, stride_len, stride_len, 1], padding='VALID')\n c_layer = tf.nn.bias_add(c_layer, biases)\n\n c_layer = maxpool(c_layer, 2) # stride length for max_pool = 2\n\n c_layer = tf.nn.relu(c_layer)\n return c_layer\n\n\ndef reshape_layer(ip):\n '''\n Flatten or reshape conv_layer for input to fc_layer\n '''\n flat_layer = tf.reshape(ip, [-1, ip.get_shape()[1:4].num_elements()])\n\n #print(\"Flat Layer: %s\" %flat_layer.get_shape())\n\n return flat_layer\n\n\ndef fully_conn_layer(ip, weights, biases):\n '''\n input_layer_width -> Number of inputs from previous flattened layer\n output_layer_width -> width of layer, or equal to number of classes for prediction layer\n '''\n flat_input = reshape_layer(ip)\n fc_layer = tf.add(tf.matmul(flat_input, weights), biases)\n\n #print(\"FC Layer: %s\" %fc_layer.get_shape())\n\n # Not applying ReLU here because this could be prediction layer\n return fc_layer\n\n\ndef plot_loss_accuracy(batches, loss_batch, train_acc_batch, valid_acc_batch):\n loss_plot = plt.subplot(211)\n loss_plot.set_title('Loss')\n loss_plot.plot(batches, loss_batch, 'g')\n loss_plot.set_xlim([batches[0], batches[-1]])\n acc_plot = plt.subplot(212)\n acc_plot.set_title('Accuracy')\n acc_plot.plot(batches, train_acc_batch, 'r', label='Training Accuracy')\n acc_plot.plot(batches, valid_acc_batch, 'b', label='Validation Accuracy')\n acc_plot.set_ylim([0, 1.0])\n acc_plot.set_xlim([batches[0], batches[-1]])\n 
acc_plot.legend(loc=4)\n plt.tight_layout()\n plt.show()\n\n\ndef plot_confusion_matrix(cls_pred):\n test_cls = np.argmax(y_test_oh, axis=1)\n cm = confusion_matrix(y_true=test_cls,\n y_pred=cls_pred)\n\n # Print the confusion matrix as text.\n # print(cm)\n\n rcParams['figure.figsize'] = 13, 13\n\n # Plot the confusion matrix as an image.\n plt.matshow(cm)\n\n # Make various adjustments to the plot.\n plt.colorbar()\n tick_marks = np.arange(n_classes)\n plt.xticks(tick_marks, range(n_classes))\n plt.yticks(tick_marks, range(n_classes))\n plt.xlabel('Predicted')\n plt.ylabel('True')\n\n plt.show()\n\n\ndef print_test_accuracy():\n test_cls = np.argmax(y_test_oh, axis=1)\n cls_pred = np.zeros(shape=n_test, dtype=np.int)\n i = 0\n\n while i < n_test:\n j = min(i + batch_size, n_test)\n\n batch_x = X_test_gray_flat[i:j]\n batch_y = y_test_oh[i:j]\n\n feed_dict = {network_input: batch_x,\n network_output: batch_y, keep_prob: 1.0}\n\n cls_pred[i:j] = session.run(label_pred_class, feed_dict=feed_dict)\n\n i = j\n\n correct = (test_cls == cls_pred)\n correct_sum = correct.sum()\n\n acc = float(correct_sum) / n_test\n\n msg = \"Accuracy on Test-Set: {0:.1%} ({1} / {2})\"\n print(msg.format(acc, correct_sum, n_test))\n\n plot_confusion_matrix(cls_pred=cls_pred)\n\n\n# Convnet model\ndef convnet_model(ip, weights_dict, biases_dict, dropout_prob):\n\n # First conv layer\n c_layer_1 = conv_layer(\n ip, weights_dict['layer_1'], biases_dict['layer_1'], stride_len=1)\n c_layer_1 = dropout(c_layer_1, dropout_prob)\n\n # Second conv layer\n c_layer_2 = conv_layer(\n c_layer_1, weights_dict['layer_2'], biases_dict['layer_2'], stride_len=1)\n c_layer_2 = dropout(c_layer_2, dropout_prob)\n\n # FC Layer\n fc_layer = fully_conn_layer(\n c_layer_2, weights_dict['fc_layer'], biases_dict['fc_layer'])\n\n fc_layer = tf.nn.relu(fc_layer)\n fc_layer = dropout(fc_layer, dropout_prob)\n\n # Output fc layer\n out = fully_conn_layer(fc_layer, weights_dict['out'], biases_dict['out'])\n\n return out\n\n\n# Dimensions\nlayer_width = {\n 'layer_1': 32,\n 'layer_2': 64,\n 'fc_layer': 1024,\n 'out': n_classes\n} # number of filters\n\nfilter_size = {\n 'layer_1': 5,\n 'layer_2': 5\n} # conv filter size\n\n\n# Weights and biases\nweights_dict = {\n 'layer_1': calc_weights([filter_size['layer_1'], filter_size['layer_1'], num_channels, layer_width['layer_1']]),\n 'layer_2': calc_weights([filter_size['layer_2'], filter_size['layer_2'], layer_width['layer_1'], layer_width['layer_2']]),\n 'fc_layer': calc_weights([1600, layer_width['fc_layer']]),\n 'out': calc_weights([layer_width['fc_layer'], layer_width['out']])\n}\n\nbiases_dict = {\n 'layer_1': calc_biases([layer_width['layer_1']]),\n 'layer_2': calc_biases([layer_width['layer_2']]),\n 'fc_layer': calc_biases([layer_width['fc_layer']]),\n 'out': calc_biases([layer_width['out']])\n}\n\n\n# Define Parameters\nlearning_rate = 0.001\ntraining_epochs = 20\nbatch_size = 64\ndisplay_step = 1\n\n\n# Input graph for TF\nnetwork_input = tf.placeholder(\n \"float\", [None, image_size, image_size, num_channels])\nnetwork_output = tf.placeholder(\"float\", [None, n_classes])\nkeep_prob = tf.placeholder(tf.float32)\n\nlogits = convnet_model(network_input, weights_dict, biases_dict, keep_prob)\n\nlabel_pred = tf.nn.softmax(logits)\nlabel_pred_class = tf.argmax(label_pred, 1)\n\n# Cost function and Optimizer\ncost = tf.reduce_mean(\n tf.nn.softmax_cross_entropy_with_logits(logits, network_output))\noptimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)\n\n# Check if 
predictions are correct.\ncorrect_prediction = tf.equal(label_pred_class, tf.argmax(network_output, 1))\naccuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n\n\n# ### Train, Validate and Test the Model\n\n# A validation set can be used to assess how well the model is performing. A low accuracy on the training and validation\n# sets imply underfitting. A high accuracy on the test set but low accuracy on the validation set implies overfitting.\n\n\n# Train your model here.\n# Calculate and report the accuracy on the training and validation set.\n# Once a final model architecture is selected,\n# the accuracy on the test set should be calculated and reported as well..\n\n# Initialize variables\ninit = tf.global_variables_initializer()\n#init = tf.initialize_all_variables()\n\nbatches = []\nloss_batch = []\ntrain_acc_batch = []\nvalid_acc_batch = []\n\ntotal_time = time.time()\n# Launch graph\nsession = tf.Session()\nsession.run(init)\n\n# Training cycle\nfor epochs in range(training_epochs):\n start_time = time.time()\n avg_cost = 0.\n total_batch = int(len(train_features)/batch_size)\n\n train_features_f, train_labels = shuffle(train_features_f, train_labels)\n valid_features_f, valid_labels = shuffle(valid_features_f, valid_labels)\n\n for i in range(total_batch):\n idx_l = i*batch_size\n idx_h = idx_l + batch_size\n\n batch_x = train_features_f[idx_l:idx_h]\n batch_y = train_labels[idx_l:idx_h]\n\n _, c = session.run([optimizer, cost], feed_dict={\n network_input: batch_x, network_output: batch_y, keep_prob: 0.5})\n\n # Compute average loss\n avg_cost += c / total_batch\n\n # Log every 50 batches\n if not i % 50:\n # Calculate Training and Validation accuracy\n training_accuracy = session.run(accuracy, feed_dict={network_input: train_features_f,\n network_output: train_labels, keep_prob: 0.5})\n validation_accuracy = session.run(accuracy, feed_dict={network_input: valid_features_f,\n network_output: valid_labels, keep_prob: 1})\n\n # Log batches\n previous_batch = batches[-1] if batches else 0\n batches.append(50 + previous_batch)\n loss_batch.append(c)\n train_acc_batch.append(training_accuracy)\n valid_acc_batch.append(validation_accuracy)\n\n # Display logs per epoch step\n if epochs % display_step == 0:\n print(\"Epoch:\", '%04d' % (epochs+1), \"cost=\",\n \"{:.9f}\".format(avg_cost))\n end_time = time.time() - start_time\n print(\"Time after epoch: %s\" % end_time)\n\n # Check accuracy against Validation data\n validation_accuracy = session.run(accuracy, feed_dict={network_input: valid_features_f,\n network_output: valid_labels, keep_prob: 1})\n\nprint(\"Optimization Finished!\")\n\ntest_accuracy = session.run(accuracy, feed_dict={\n network_input: X_test_gray_flat, network_output: y_test_oh, keep_prob: 1})\nprint(\"Training Accuracy:\", training_accuracy)\nprint(\"Validation Accuracy:\", validation_accuracy)\nprint(\"Test Accuracy:\", test_accuracy)\n\nplot_loss_accuracy(batches, loss_batch, train_acc_batch, valid_acc_batch)\n\nfinal_time = time.time() - total_time\nprint(\"Time taken: %s\" % final_time)\n\n\n# Confusion Matrix\nprint_test_accuracy()\n\n\n# ---\n#\n# ## Step 3: Test a Model on New Images\n#\n# To give yourself more insight into how your model is working, download at least five pictures of German traffic signs from the web and use your model to predict the traffic sign type.\n#\n# You may find `signnames.csv` useful as it contains mappings from the class id (integer) to the actual sign name.\n\n# ### Load and Output the Images\n\n\n# Load the images and 
plot them here.\n\nimages = os.listdir(\"test_images/\")\n\nfig = plt.figure(figsize=(12, 5))\n\ntest_images = []\n\n# Read in test images\nfor idx, val in enumerate(images):\n # reading in an image\n image = cv2.imread('test_images/' + images[idx])\n if len(image.shape) > 2 and image.shape[2] == 4:\n # convert the image from RGBA2RGB\n image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)\n else:\n image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n\n if image.shape[0] != 32:\n\n image = cv2.resize(image, (32, 32))\n test_images.append(image)\n\n\n# Create subplots in figure\nfor r in range(1, len(test_images)+1):\n img = fig.add_subplot(2, 4, r)\n plt.imshow(test_images[r-1])\n img.set_xticks([])\n img.set_yticks([])\n\nplt.show()\n\n\n# Preprocess images\n# The images are already 32x32\n\n# Normalize data (Zero-Mean)\ntest_images_norm = (test_images - np.mean(test_images))/np.std(test_images)\n\n# Reshape\ntest_images_norm_f = np.reshape(\n test_images_norm, [-1, image_size, image_size, num_channels])\nprint(test_images_norm_f.shape)\n\n# Define True Labels\ny_new_true = np.array([27, 23, 13, 17, 2, 29, 1, 14])\n", "project_metadata": {"full_name": "sahiljuneja/Udacity-SDCND-Term-1", "description": "Udacity's Self-Driving Car Nanodegree Term 1 - Computer Vision and Deep Learning", "topics": [], "git_url": "git://github.com/sahiljuneja/Udacity-SDCND-Term-1.git", "stars": 8, "watchers": 8, "forks": 8, "created": "2016-10-29T07:42:43Z", "size": 186793, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 28135431, "HTML": 688344, "Python": 65834, "Shell": 2892}, "last_updated": "2020-02-13T18:51:09Z"}, "intent": "# Convert to One Hot Encoded Values"}, {"original_comment": "# plot histogram of X_2 on the right\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Working with Data\n#\n# One of the most important aspects of designing any machine learning system is to __understand your data__. In this notebook, we're going to load and visualize some very basic datasets so that you understand some of the various tools available to you in Python for working with data. You should __always try to visualize your data__ before you use it in any type of algorithm. First we will look at a random dataset, and then we'll use `sklearn` to study the Iris dataset and other famous datasets.\n#\n# _Note: some code segments have TODO comments in them. These comments are optional exercises for you to modify the code in a useful way, however they are not meant to be restrictive. Feel free to modify the code in this notebook any way you like; it's a great way to practice your coding skills._\n#\n# ## Getting Started\n#\n# You should have your own Anaconda virtual environment with all of the necessary Python modules installed. 
You can check by trying to import them:\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport os\nimport pandas as pd\nimport seaborn as sns\nimport skimage.io\nimport sklearn\nimport sklearn.datasets\n\n\n# ## Generating Random Data\n#\n# Let's begin by generating a random dataset of 2-D data points.\n\n#%%\n\n# you can set a random seed to get \"reproducible\" randomness\nnp.random.seed(912)\n\n# define a function to generate random data\n\n\ndef random_data(n_samples):\n \"\"\"\n Generates random data in two dimensions.\n Returns a (n_samples, 2) numpy array.\n \"\"\"\n # define the parameters (mean, covariance) of a normal distribution\n mean = [0.5, 0.5]\n cov = [[1, 0], [0, 100]]\n\n # generate n_samples of 2D data\n X = np.random.multivariate_normal(mean, cov, size=n_samples)\n\n return X\n\n\n# Let's see what this data looks like. We'll start small and draw 50 samples from our function.\n\n#%%\n\nX = random_data(50)\nprint(\"Dataset shape: (%d, %d)\" % X.shape)\nprint(X)\n\n\n# Awesome! Now we have some data to play with.\n#\n# ## Plotting Univariate Data\n#\n# Let's try to generate some more samples and then plot them.\n\n#%%\n\n# generate 1000 random samples\nX = random_data(1000)\n\n\n# One of the most basic aspects of a dataset is its __dimensionality__. For example, if a dataset is 2-dimensional, it means that the dataset has two variables that are influencing the outcomes. These dimensions are also called the __features__ of a dataset. In this case, `X` is our dataset, and it has two features (we'll call them \"X_1\" and \"X_2\"). Let's plot each of these features individually using `matplotlib`:\n\n#%%\n\n# generate indices for X\nidx = range(len(X))\n\n# initialize a 2x1 figure for plotting\n_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n\n# plot X_1 on the left\nax1.scatter(idx, X[:, 0], c=\"b\")\nax1.set_title(\"X_1\")\nax1.set_xlabel(\"Index\")\nax1.set_ylabel(\"Value\")\n\n# plot X_2 on the right\nax2.scatter(idx, X[:, 1], c=\"r\")\nax2.set_title(\"X_2\")\nax2.set_xlabel(\"Index\")\nax2.set_ylabel(\"Value\")\n\n# display the plot\nplt.show()\n\n\n# Here we've simply plotted all of the samples in a line, separately for each feature. This visualization is fairly straightforward, but it's not how we typically look at data of this type. That is, since this dataset consists of many independent samples, a better way to visualize this data is to use a __histogram__. The `matplotlib` library has a histogram function called `plt.hist()`, but here we're going to use another library called `seaborn`, which provides some fancier plotting functions on top of `matplotlib`.\n\n#%%\n\n# initialize a 2x1 figure for plotting\n_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5), sharex=True)\n\n# plot histogram of X_1 on the left\nsns.distplot(X[:, 0], kde=True, color=\"b\", ax=ax1)\nax1.set_title(\"X_1\")\nax1.set_xlabel(\"Value\")\nax1.set_ylabel(\"Frequency\")", "target_code": "sns.distplot(X[:, 1], kde=True, color=\"r\", ax=ax2)\nax2.set_title(\"X_2\")\nax2.set_xlabel(\"Value\")\nax2.set_ylabel(\"Frequency\")\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Working with Data\n#\n# One of the most important aspects of designing any machine learning system is to __understand your data__. In this notebook, we're going to load and visualize some very basic datasets so that you understand some of the various tools available to you in Python for working with data. 
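The record above targets a seaborn distplot of the second feature, and its notebook text mentions matplotlib's plt.hist() as the plainer alternative. A minimal sketch of that alternative on the same synthetic data (the bin count and figure size are arbitrary choices):

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(912)
X = np.random.multivariate_normal([0.5, 0.5], [[1, 0], [0, 100]], size=1000)

# plain-matplotlib alternative to seaborn's distplot: one histogram per feature
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
ax1.hist(X[:, 0], bins=30, color="b")
ax1.set_title("X_1")
ax2.hist(X[:, 1], bins=30, color="r")
ax2.set_title("X_2")
plt.show()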
You should __always try to visualize your data__ before you use it in any type of algorithm. First we will look at a random dataset, and then we'll use `sklearn` to study the Iris dataset and other famous datasets.\n#\n# _Note: some code segments have TODO comments in them. These comments are optional exercises for you to modify the code in a useful way, however they are not meant to be restrictive. Feel free to modify the code in this notebook any way you like; it's a great way to practice your coding skills._\n#\n# ## Getting Started\n#\n# You should have your own Anaconda virtual environment with all of the necessary Python modules installed. You can check by trying to import them:\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport os\nimport pandas as pd\nimport seaborn as sns\nimport skimage.io\nimport sklearn\nimport sklearn.datasets\n\n\n# ## Generating Random Data\n#\n# Let's begin by generating a random dataset of 2-D data points.\n\n\n# you can set a random seed to get \"reproducible\" randomness\nnp.random.seed(912)\n\n# define a function to generate random data\n\n\ndef random_data(n_samples):\n \"\"\"\n Generates random data in two dimensions.\n Returns a (n_samples, 2) numpy array.\n \"\"\"\n # define the parameters (mean, covariance) of a normal distribution\n mean = [0.5, 0.5]\n cov = [[1, 0], [0, 100]]\n\n # generate n_samples of 2D data\n X = np.random.multivariate_normal(mean, cov, size=n_samples)\n\n return X\n\n\n# Let's see what this data looks like. We'll start small and draw 50 samples from our function.\n\n\nX = random_data(50)\nprint(\"Dataset shape: (%d, %d)\" % X.shape)\nprint(X)\n\n\n# Awesome! Now we have some data to play with.\n#\n# ## Plotting Univariate Data\n#\n# Let's try to generate some more samples and then plot them.\n\n\n# generate 1000 random samples\nX = random_data(1000)\n\n\n# One of the most basic aspects of a dataset is its __dimensionality__. For example, if a dataset is 2-dimensional, it means that the dataset has two variables that are influencing the outcomes. These dimensions are also called the __features__ of a dataset. In this case, `X` is our dataset, and it has two features (we'll call them \"X_1\" and \"X_2\"). Let's plot each of these features individually using `matplotlib`:\n\n\n# generate indices for X\nidx = range(len(X))\n\n# initialize a 2x1 figure for plotting\n_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5), sharey=True)\n\n# plot X_1 on the left\nax1.scatter(idx, X[:, 0], c=\"b\")\nax1.set_title(\"X_1\")\nax1.set_xlabel(\"Index\")\nax1.set_ylabel(\"Value\")\n\n# plot X_2 on the right\nax2.scatter(idx, X[:, 1], c=\"r\")\nax2.set_title(\"X_2\")\nax2.set_xlabel(\"Index\")\nax2.set_ylabel(\"Value\")\n\n# display the plot\nplt.show()\n\n\n# Here we've simply plotted all of the samples in a line, separately for each feature. This visualization is fairly straightforward, but it's not how we typically look at data of this type. That is, since this dataset consists of many independent samples, a better way to visualize this data is to use a __histogram__. 
The `matplotlib` library has a histogram function called `plt.hist()`, but here we're going to use another library called `seaborn`, which provides some fancier plotting functions on top of `matplotlib`.\n\n\n# initialize a 2x1 figure for plotting\n_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5), sharex=True)\n\n# plot histogram of X_1 on the left\nsns.distplot(X[:, 0], kde=True, color=\"b\", ax=ax1)\nax1.set_title(\"X_1\")\nax1.set_xlabel(\"Value\")\nax1.set_ylabel(\"Frequency\")\n", "project_metadata": {"full_name": "CUFCTL/creative-inquiry", "description": "Repository for FCTL creative inquiries", "topics": [], "git_url": "git://github.com/CUFCTL/creative-inquiry.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-04-19T14:19:33Z", "size": 1403, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 323042, "TeX": 5727}, "last_updated": "2020-12-09T18:24:40Z"}, "intent": "# plot histogram of X_2 on the right"}, {"original_comment": "# for sentence in the corpus predict its scores\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Sentiment Analysis
\n#\n\n#%%\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n#%%\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n#%%\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n#%%\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n#%%\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\n\n\n#%%\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n#%%\n\ncorpus\n\n#%%\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n#%%\n\n# AFINN Dictionary for Sentiment Analysis: 
https://github.com/fnielsen/afinn\n# https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n#%%\n\npred = []", "target_code": "for review in corpus:\n score = afinn.score(review)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Sentiment Analysis
\n#\n\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\n\n\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n\ncorpus\n\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n\n# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn\n# 
https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n\npred = []\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "intent": "# for sentence in the corpus predict its scores"}, {"original_comment": "# normlize shapes\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Animating and Smoothing 3D Facial Keypoints\n# **Note:** if you are impatient, please **press the \"output\" tab** to look at a few giff's that are the final result of this script (please wait a bit for the giff's to load, and then they will be displayed at normal speed)\n#\n# In this script I will build upon the very pretty visualizations in [DrGuillermo's 3D Animation Script](https://www.kaggle.com/drgilermo/3d-kmeans-animation) and provide a utility function to draw 3D shape animations with a surrounding 3D bounding box. In this script I also provide several additional utility functions to aid the process of working with this dataset: one function to normalize shapes (2D or 3D) and an additional one to write videos for visualization.\n#\n# After showing an animation of the movment of facial keypoints in 3D, we then continue to filter some of the noise in the shape keypoints data by utilizing the spatial correlations across the dataset and create animations of the denoised keypoints, resulting in a much smoother and nicer animations.\n#\n# We then continue to apply temporal filtering on the denoised keypoint coordinates, resulting in even smoother animations.\n#\n# Finally, we verify that the filtering opperations didn't ruin anything by overlaying them on the original videos and looking at the differences. 
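Returning to the AFINN example that closes above: a minimal sketch of the corpus-scoring loop its intent describes, reusing the sample sentences from that example as a stand-in corpus. The 0-threshold mapping onto the "-1"/"1" label scheme and the accuracy check are illustrative additions, not taken from the source.

from afinn import Afinn
from sklearn.metrics import accuracy_score

afinn = Afinn()

# tiny stand-in corpus and labels (hypothetical; the real cells use the Yelp reviews)
corpus = ["This is bad fake news",
          "The sun is shining, what a beautiful day"]
labels = ["-1", "1"]

pred = []
for review in corpus:
    # score every review with the AFINN valence lexicon
    score = afinn.score(review)
    # map the raw score onto the -1 / +1 label scheme (thresholding at 0 is an illustrative choice)
    pred.append("1" if score >= 0 else "-1")

print(pred)
print(accuracy_score(labels, pred))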
We conclude that the process of spatio-temporal filtering does produce a much nicer and cleaner anontation.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nfrom sklearn import decomposition\nfrom scipy import signal\nimport matplotlib.pyplot as plt\nimport matplotlib.animation as animation\nimport matplotlib\nimport plotly.offline as py\nimport plotly.graph_objs as go\n#import imageio\nimport glob\n\npy.init_notebook_mode(connected=True)\n\n\n# # Load the Data\n\n#%%\n\nvideoDF = pd.read_csv('../input/youtube_faces_with_keypoints_large.csv')\n\n# create a dictionary that maps videoIDs to full file paths\nnpzFilesFullPath = glob.glob('../input/youtube_faces_*/*.npz')\nvideoIDs = [x.split('/')[-1].split('.')[0] for x in npzFilesFullPath]\nfullPaths = {}\nfor videoID, fullPath in zip(videoIDs, npzFilesFullPath):\n fullPaths[videoID] = fullPath\n\n# remove from the large csv file all videos that weren't uploaded yet\nvideoDF = videoDF.loc[videoDF.loc[:, 'videoID'].isin(\n fullPaths.keys()), :].reset_index(drop=True)\nprint('Number of Videos uploaded so far is %d' % (videoDF.shape[0]))\nprint('Number of Unique Individuals so far is %d' %\n (len(videoDF['personName'].unique())))\n\n\n# # Show Overview of Dataset Content (that has been uploaded so far)\n\n#%%\n\n# overview of the contents of the dataset\ngroupedByPerson = videoDF.groupby(\"personName\")\nnumVidsPerPerson = groupedByPerson.count()['videoID']\ngroupedByPerson.count().sort_values('videoID', axis=0, ascending=False)\n\nplt.close('all')\nplt.figure(figsize=(25, 20))\nplt.subplot(2, 2, 1)\nplt.hist(x=numVidsPerPerson, bins=0.5 +\n np.arange(numVidsPerPerson.min()-1, numVidsPerPerson.max()+1))\nplt.title('Number of Videos per Person', fontsize=30)\nplt.xlabel('Number of Videos', fontsize=25)\nplt.ylabel('Number of People', fontsize=25)\n\nplt.subplot(2, 2, 2)\nplt.hist(x=videoDF['videoDuration'], bins=20)\nplt.title('Distribution of Video Duration', fontsize=30)\nplt.xlabel('duration [frames]', fontsize=25)\nplt.ylabel('Number of Videos', fontsize=25)\nplt.xlim(videoDF['videoDuration'].min()-2, videoDF['videoDuration'].max()+2)\n\nplt.subplot(2, 2, 3)\nplt.scatter(x=videoDF['imageWidth'], y=videoDF['imageHeight'])\nplt.title('Distribution of Image Sizes', fontsize=30)\nplt.xlabel('Image Width [pixels]', fontsize=25)\nplt.ylabel('Image Height [pixels]', fontsize=25)\nplt.xlim(0, videoDF['imageWidth'].max() + 15)\nplt.ylim(0, videoDF['imageHeight'].max()+15)\n\nplt.subplot(2, 2, 4)\naverageFaceSize_withoutNaNs = np.array(videoDF['averageFaceSize'])\naverageFaceSize_withoutNaNs = averageFaceSize_withoutNaNs[np.logical_not(\n np.isnan(averageFaceSize_withoutNaNs))]\nplt.hist(averageFaceSize_withoutNaNs, bins=20)\nplt.title('Distribution of Average Face Sizes ', fontsize=30)\nplt.xlabel('Average Face Size [pixels]', fontsize=25)\nplt.ylabel('Number of Videos', fontsize=25)\n\n\n# # Define some shape normalization utility functions\n\n#%%\n\n# %% define shape normalization utility functions\ndef NormlizeShapes(shapesImCoords):\n (numPoints, numDims, _) = shapesImCoords.shape\n \"\"\"shapesNomalized, scaleFactors, meanCoords = NormlizeShapes(shapesImCoords)\"\"\"\n\n # calc mean coords and subtract from shapes\n meanCoords = shapesImCoords.mean(axis=0)\n shapesCentered = np.zeros(shapesImCoords.shape)\n shapesCentered = shapesImCoords - np.tile(meanCoords, [numPoints, 1, 1])\n\n # calc scale factors and divide shapes\n scaleFactors = np.sqrt((shapesCentered**2).sum(axis=1)).mean(axis=0)\n shapesNormlized = np.zeros(shapesCentered.shape)\n 
shapesNormlized = shapesCentered / \\\n np.tile(scaleFactors, [numPoints, numDims, 1])\n\n return shapesNormlized, scaleFactors, meanCoords\n\n\ndef TransformShapeBackToImageCoords(shapesNomalized, scaleFactors, meanCoords):\n \"\"\"shapesImCoords_rec = TransformShapeBackToImageCoords(shapesNomalized, scaleFactors, meanCoords)\"\"\"\n (numPoints, numDims, _) = shapesNomalized.shape\n\n # move back to the correct scale\n shapesCentered = shapesNomalized * \\\n np.tile(scaleFactors, [numPoints, numDims, 1])\n # move back to the correct location\n shapesImCoords = shapesCentered + np.tile(meanCoords, [numPoints, 1, 1])\n\n return shapesImCoords\n\n\n# # Normalize the 2D and 3D Shapes\n# remember that like we showed in the [Exploration Script](https://www.kaggle.com/selfishgene/exploring-youtube-faces-with-keypoints-dataset), in order to compare apples to apples (or in this case, shapes to shapes), we need first to normalize the shapes in and manually remove the things that we don't care about (in this case, we want to disregard translation and scale differences between shapes, and model only the shape's shape :-) )\n\n#%%\n\n# %% Normalize 2D and 3D shapes\n\n# collect all 2D and 3D shapes from all frames from all videos to a single numpy array matrix\ntotalNumberOfFrames = videoDF['videoDuration'].sum()\nlandmarks2D_all = np.zeros((68, 2, int(totalNumberOfFrames)))\nlandmarks3D_all = np.zeros((68, 3, int(totalNumberOfFrames)))\n\nshapeIndToVideoID = {} # dictionary for later useage\nendInd = 0\nfor i, videoID in enumerate(videoDF['videoID']):\n\n # load video\n videoFile = np.load(fullPaths[videoID])\n landmarks2D = videoFile['landmarks2D']\n landmarks3D = videoFile['landmarks3D']\n\n startInd = endInd\n endInd = startInd + landmarks2D.shape[2]\n\n # store in one big array\n landmarks2D_all[:, :, startInd:endInd] = landmarks2D\n landmarks3D_all[:, :, startInd:endInd] = landmarks3D\n\n # make sure we keep track of the mapping to the original video and frame\n for videoFrameInd, shapeInd in enumerate(range(startInd, endInd)):\n shapeIndToVideoID[shapeInd] = (videoID, videoFrameInd)", "target_code": "landmarks2D_normlized, _, _ = NormlizeShapes(landmarks2D_all)\nlandmarks3D_normlized, _, _ = NormlizeShapes(landmarks3D_all)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Animating and Smoothing 3D Facial Keypoints\n# **Note:** if you are impatient, please **press the \"output\" tab** to look at a few giff's that are the final result of this script (please wait a bit for the giff's to load, and then they will be displayed at normal speed)\n#\n# In this script I will build upon the very pretty visualizations in [DrGuillermo's 3D Animation Script](https://www.kaggle.com/drgilermo/3d-kmeans-animation) and provide a utility function to draw 3D shape animations with a surrounding 3D bounding box. 
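A minimal sketch of applying these utilities to the stacked landmark arrays, matching the NormlizeShapes calls this example targets; the names chosen for the returned scale factors and mean coordinates, and the round-trip sanity check, are illustrative only.

# normalize the stacked 2D and 3D shapes (68 points x dims x frames)
landmarks2D_normalized, scaleFactors2D, meanCoords2D = NormlizeShapes(landmarks2D_all)
landmarks3D_normalized, scaleFactors3D, meanCoords3D = NormlizeShapes(landmarks3D_all)

# sanity check: transforming back should recover the original image coordinates
landmarks2D_rec = TransformShapeBackToImageCoords(
    landmarks2D_normalized, scaleFactors2D, meanCoords2D)
print(np.abs(landmarks2D_rec - landmarks2D_all).max())  # expected to be ~0 up to float error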
In this script I also provide several additional utility functions to aid the process of working with this dataset: one function to normalize shapes (2D or 3D) and an additional one to write videos for visualization.\n#\n# After showing an animation of the movment of facial keypoints in 3D, we then continue to filter some of the noise in the shape keypoints data by utilizing the spatial correlations across the dataset and create animations of the denoised keypoints, resulting in a much smoother and nicer animations.\n#\n# We then continue to apply temporal filtering on the denoised keypoint coordinates, resulting in even smoother animations.\n#\n# Finally, we verify that the filtering opperations didn't ruin anything by overlaying them on the original videos and looking at the differences. We conclude that the process of spatio-temporal filtering does produce a much nicer and cleaner anontation.\n\n\nimport numpy as np\nimport pandas as pd\nfrom sklearn import decomposition\nfrom scipy import signal\nimport matplotlib.pyplot as plt\nimport matplotlib.animation as animation\nimport matplotlib\nimport plotly.offline as py\nimport plotly.graph_objs as go\n#import imageio\nimport glob\n\npy.init_notebook_mode(connected=True)\n\n\n# # Load the Data\n\n\nvideoDF = pd.read_csv('../input/youtube_faces_with_keypoints_large.csv')\n\n# create a dictionary that maps videoIDs to full file paths\nnpzFilesFullPath = glob.glob('../input/youtube_faces_*/*.npz')\nvideoIDs = [x.split('/')[-1].split('.')[0] for x in npzFilesFullPath]\nfullPaths = {}\nfor videoID, fullPath in zip(videoIDs, npzFilesFullPath):\n fullPaths[videoID] = fullPath\n\n# remove from the large csv file all videos that weren't uploaded yet\nvideoDF = videoDF.loc[videoDF.loc[:, 'videoID'].isin(\n fullPaths.keys()), :].reset_index(drop=True)\nprint('Number of Videos uploaded so far is %d' % (videoDF.shape[0]))\nprint('Number of Unique Individuals so far is %d' %\n (len(videoDF['personName'].unique())))\n\n\n# # Show Overview of Dataset Content (that has been uploaded so far)\n\n\n# overview of the contents of the dataset\ngroupedByPerson = videoDF.groupby(\"personName\")\nnumVidsPerPerson = groupedByPerson.count()['videoID']\ngroupedByPerson.count().sort_values('videoID', axis=0, ascending=False)\n\nplt.close('all')\nplt.figure(figsize=(25, 20))\nplt.subplot(2, 2, 1)\nplt.hist(x=numVidsPerPerson, bins=0.5 +\n np.arange(numVidsPerPerson.min()-1, numVidsPerPerson.max()+1))\nplt.title('Number of Videos per Person', fontsize=30)\nplt.xlabel('Number of Videos', fontsize=25)\nplt.ylabel('Number of People', fontsize=25)\n\nplt.subplot(2, 2, 2)\nplt.hist(x=videoDF['videoDuration'], bins=20)\nplt.title('Distribution of Video Duration', fontsize=30)\nplt.xlabel('duration [frames]', fontsize=25)\nplt.ylabel('Number of Videos', fontsize=25)\nplt.xlim(videoDF['videoDuration'].min()-2, videoDF['videoDuration'].max()+2)\n\nplt.subplot(2, 2, 3)\nplt.scatter(x=videoDF['imageWidth'], y=videoDF['imageHeight'])\nplt.title('Distribution of Image Sizes', fontsize=30)\nplt.xlabel('Image Width [pixels]', fontsize=25)\nplt.ylabel('Image Height [pixels]', fontsize=25)\nplt.xlim(0, videoDF['imageWidth'].max() + 15)\nplt.ylim(0, videoDF['imageHeight'].max()+15)\n\nplt.subplot(2, 2, 4)\naverageFaceSize_withoutNaNs = np.array(videoDF['averageFaceSize'])\naverageFaceSize_withoutNaNs = averageFaceSize_withoutNaNs[np.logical_not(\n np.isnan(averageFaceSize_withoutNaNs))]\nplt.hist(averageFaceSize_withoutNaNs, bins=20)\nplt.title('Distribution of Average Face Sizes ', 
fontsize=30)\nplt.xlabel('Average Face Size [pixels]', fontsize=25)\nplt.ylabel('Number of Videos', fontsize=25)\n\n\n# # Define some shape normalization utility functions\n\n\n# %% define shape normalization utility functions\ndef NormlizeShapes(shapesImCoords):\n (numPoints, numDims, _) = shapesImCoords.shape\n \"\"\"shapesNomalized, scaleFactors, meanCoords = NormlizeShapes(shapesImCoords)\"\"\"\n\n # calc mean coords and subtract from shapes\n meanCoords = shapesImCoords.mean(axis=0)\n shapesCentered = np.zeros(shapesImCoords.shape)\n shapesCentered = shapesImCoords - np.tile(meanCoords, [numPoints, 1, 1])\n\n # calc scale factors and divide shapes\n scaleFactors = np.sqrt((shapesCentered**2).sum(axis=1)).mean(axis=0)\n shapesNormlized = np.zeros(shapesCentered.shape)\n shapesNormlized = shapesCentered / \\\n np.tile(scaleFactors, [numPoints, numDims, 1])\n\n return shapesNormlized, scaleFactors, meanCoords\n\n\ndef TransformShapeBackToImageCoords(shapesNomalized, scaleFactors, meanCoords):\n \"\"\"shapesImCoords_rec = TransformShapeBackToImageCoords(shapesNomalized, scaleFactors, meanCoords)\"\"\"\n (numPoints, numDims, _) = shapesNomalized.shape\n\n # move back to the correct scale\n shapesCentered = shapesNomalized * \\\n np.tile(scaleFactors, [numPoints, numDims, 1])\n # move back to the correct location\n shapesImCoords = shapesCentered + np.tile(meanCoords, [numPoints, 1, 1])\n\n return shapesImCoords\n\n\n# # Normalize the 2D and 3D Shapes\n# remember that like we showed in the [Exploration Script](https://www.kaggle.com/selfishgene/exploring-youtube-faces-with-keypoints-dataset), in order to compare apples to apples (or in this case, shapes to shapes), we need first to normalize the shapes in and manually remove the things that we don't care about (in this case, we want to disregard translation and scale differences between shapes, and model only the shape's shape :-) )\n\n\n# %% Normalize 2D and 3D shapes\n\n# collect all 2D and 3D shapes from all frames from all videos to a single numpy array matrix\ntotalNumberOfFrames = videoDF['videoDuration'].sum()\nlandmarks2D_all = np.zeros((68, 2, int(totalNumberOfFrames)))\nlandmarks3D_all = np.zeros((68, 3, int(totalNumberOfFrames)))\n\nshapeIndToVideoID = {} # dictionary for later useage\nendInd = 0\nfor i, videoID in enumerate(videoDF['videoID']):\n\n # load video\n videoFile = np.load(fullPaths[videoID])\n landmarks2D = videoFile['landmarks2D']\n landmarks3D = videoFile['landmarks3D']\n\n startInd = endInd\n endInd = startInd + landmarks2D.shape[2]\n\n # store in one big array\n landmarks2D_all[:, :, startInd:endInd] = landmarks2D\n landmarks3D_all[:, :, startInd:endInd] = landmarks3D\n\n # make sure we keep track of the mapping to the original video and frame\n for videoFrameInd, shapeInd in enumerate(range(startInd, endInd)):\n shapeIndToVideoID[shapeInd] = (videoID, videoFrameInd)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# normlize shapes"}, {"original_comment": "# compute t-SNE of X\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Unsupervised Learning\n#\n# In the \"Supervised Learning\" notebook we introduced the idea of a 
\"task\" and we studied the family of supervised learning tasks, in which the goal is to learn some kind of mapping from labeled data. This description probably begs the question -- what if the data is unlabeled? Would we even be able to do anything with it? It turns out there are a number of things we can still learn about the data; algorithms that work with unlabeled data are called __unsupervised learning__ algorithms.\n#\n# In unsupervised learning, the task is to learn something about the structure of the data. There are a lot of ways we can define the kind of \"structure\" that we want. For example, we could try to group data points into categories based on their values; this task is called __clustering__. As another example, we could take a high-dimensional dataset and try to respresent it somehow in 2D or 3D so that we can visualize it; this task is called __dimensionality reduction__. In this notebook we will look at these two tasks, as they are some of the most common in unsupervised learning. We will continue to use the Iris dataset and other toy datasets from scikit-learn and seaborn.\n#\n# _Note: some code segments have TODO comments in them. These comments are optional exercises for you to modify the code in a useful way, however they are not meant to be restrictive. Feel free to modify the code in this notebook any way you like; it's a great way to practice your coding skills._\n#\n# ## Getting Started\n#\n# You should have your own Anaconda virtual environment with all of the necessary Python modules installed. You can check by trying to import them:\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport sklearn\nimport sklearn.cluster\nimport sklearn.datasets\nimport sklearn.decomposition\nimport sklearn.manifold\nimport sklearn.metrics\nimport sklearn.model_selection\nimport sklearn.neighbors\nimport sklearn.pipeline\n\n\n# ## Clustering: Iris Dataset\n#\n# So how would we \"cluster\" the Iris dataset? And why would we want to in the first place? After all, we already have the labels, so we already know what category each sample should belong to. As it turns out, this is a great way to evaluate a clustering algorithm -- if we put aside the labels and cluster the dataset using only the data, we can then compare the clusters identified by the algorithm with the original labels. Remember that one of the first questions we must ask before trying a supervised learning task is whether there is truly a pattern in the data, and clustering algorithms allow us to answer this question.\n#\n# To do this we will take the entire dataset, use a clustering algorithm to identify cluster labels for each sample in the dataset, and then try to compare the labels to the true labels.\n#\n# ### K-Means Clustering\n#\n# One of the most commonly-used clustering algorithms is __k-means clustering__, because it is fast and relatively easy to understand. The algorithm is as follows:\n#\n# 1. Select $k$ random samples from the dataset to be the initial means\n# 2. Assign a label to each sample in the dataset according to the mean which is nearest to it\n# 3. Update each mean to be the centroid of the samples in its cluster\n# 4. Repeat steps 2 and 3 until the cluster labels converge\n#\n# Here's an example from Wikipedia of k-means in action:\n#\n# \"kmeans\"\n#\n# This algorithm does seem to take the same approach to clustering that k-nearest neighbors takes to classification. In fact, it kind of has the same hyperparameters:\n# 1. 
The number of clusters $k$\n# 2. The distance function, which is used to find the nearest mean to a sample\n#\n# As with kNN, we will use Euclidean distance by default. As for the number of clusters $k$, since we already know that the Iris dataset has three classes, we will use $k = 3$.\n\n#%%\n\n# load Iris dataset\niris = sklearn.datasets.load_iris()\nX = iris.data\ny = iris.target\n\n# initialize k-means model\nkmeans = sklearn.cluster.KMeans(n_clusters=3, n_jobs=-1)\n\n# fit the model to the dataset\nkmeans.fit(X)\n\n# compute cluster labels for the dataset\ny_pred = kmeans.predict(X)\n\n# show a side-by-side comparison of cluster labels and true labels\nprint(np.c_[y, y_pred])\n\n\n# It looks like the clustering model was able to cluster the dataset into the three classes, but there's one problem: the cluster indices don't necessarily match up to the class indices. In other words, the clustering model doesn't know what each cluster _is_, it only knows how the data points are clustered. What we need is a way to measure the _similarity_ of the cluster labels and the ground truth labels by accounting for permutations. One metric which provides this kind of measure is the __adjusted Rand index (ARI)__:\n\n#%%\n\nari = sklearn.metrics.adjusted_rand_score(y, y_pred)\n\nprint(\"%0.3f\" % (ari))\n\n\n# The ARI ranges from -1 to +1, where +1 is perfect similarity. We can also use the scatter plots we developed in the \"Working with Data\" notebook to visually compare the cluster labels with the ground truth labels:\n\n#%%\n\n# define function to plot a slice of Iris dataset\ndef plot_iris_2d(iris, columns, labels):\n # extract x and y axes\n x, y = iris.data[:, columns[0]], iris.data[:, columns[1]]\n\n # plot x and y\n plt.scatter(x, y, c=labels)\n plt.xlabel(iris.feature_names[columns[0]])\n plt.ylabel(iris.feature_names[columns[1]])\n\n\n# create side-by-side comparison of cluster labels and true labels\nplt.subplots(1, 2, figsize=(10, 5))\n\nplt.subplot(121)\nplot_iris_2d(iris, [0, 2], y_pred)\nplt.title(\"Cluster Labels\")\n\nplt.subplot(122)\nplot_iris_2d(iris, [0, 2], iris.target)\nplt.title(\"Ground Truth Labels\")\n\nplt.show()\n\n# TODO: change the value of k and observe its effect on the ARI and the scatter plots\n\n\n# Now we can see where the clustering model \"mis-clustered\" data points. Note that the colors will match up only if the clustering model happened to assign clusters in the same order as the ground truth labels.\n#\n# It seems like clustering is pretty easy when we have the ground truth labels. But what if we didn't? What if Ronald Fisher had neglected to label each flower that he measured? We would then have just a set of 150 flower measurements with no idea of the species of each flower. Could we use clustering to determine the species? The problem is that in this scenario we don't know the number of species -- in other words, we don't know the value of $k$. We could do a hyperparameter search on $k$, but how would we compare each model? The ARI won't help us here because, again, we don't have the ground truth labels.\n#\n# In the absence of ground truth labels, the primary evaluation metric that we can use for k-means is __inertia__, or within-class scatter. 
The inertia is the sum of the variance in each cluster; a lower inertia generally corresponds to more coherent clusters, so we will seek a value of $k$ which minimizes the inertia:\n\n#%%\n\n# evaluate k-means for several values of k\nk_values = range(1, 11)\n\nfor k in k_values:\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n y_pred = model.fit_predict(X)\n\n print(\"k = %2d: %0.2f\" % (k, model.inertia_))\n\n\n# Hmm... it seems like the inertia just keeps decreasing as $k$ increases. Well, the largest possible value of $k$ is the number of data points, so let's try that:\n\n#%%\n\nk = iris.data.shape[0]\nmodel = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\ny_pred = model.fit_predict(X)\n\nprint(\"k = %2d: %0.2f\" % (k, model.inertia_))\n\n\n# If we assume that there are as many clusters as there are data points, we effectively assign each data point to its own cluster. Since each cluster then has only one data point, the variance of each cluster is 0 and so the total variance, or inertia, is also 0. But this result is not helpful to us. It turns out that the inertia has an inherent __bias__ toward more complex models. The most common way to deal with this bias is to use what's called the __elbow method__, which is best explained with a plot:\n\n#%%\n\ndef evaluate_kmeans(k):\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n model.fit_predict(X)\n\n return model.inertia_\n\n\nx = range(1, 21)\ny = [evaluate_kmeans(k) for k in x]\n\nplt.plot(x, y)\nplt.xticks(x)\nplt.show()\n\n\n# As the plot shows, even though the inertia decreases indefinitely, it also begins to level off, creating an \"elbow\". The elbow method takes the value of $k$ at which this elbow occurs to be the best value of $k$. The idea is that this value gives us the best \"bang for our buck\" -- the simplest model at which the minimum inertia occurs (more or less). But where exactly does the elbow occur? Is it 3? Or 4? Unfortunately, the elbow method isn't an exact method. But can we really be upset? After all, without ground truth labels, the number of clusters in a dataset is quite subjective:\n\n#%%\n\nk_values = [2, 3, 4, 5]\n\nplt.subplots(1, len(k_values), figsize=(5 * len(k_values), 5))\n\nfor i in range(len(k_values)):\n k = k_values[i]\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n labels = model.fit_predict(X)\n\n plt.subplot(1, len(k_values), i + 1)\n plot_iris_2d(iris, [0, 2], labels)\n plt.title(\"k = %d\" % (k))\n\nplt.show()\n\n\n# How many clusters do you see? Can you tell which plot is the most \"correct\"? Clustering becomes much more difficult when we don't have ground truth labels or the number of clusters, because we don't have an objective way to select the best model. This issue pervades virtually every clustering algorithm in existence: __what is the right number of clusters for a dataset__? The philosophical answer is that there is no such thing! Clustering is unsupervised learning; in other words, we don't know what we're looking for. The best we can do is to hand-craft our own metrics based on ideas like inertia, and to use what hints we can get from the task we are trying to do.\n#\n# For a more in-depth overview of k-means and the many other clustering algorithms out there, we refer you to the [scikit-learn documentation](http://scikit-learn.org/stable/modules/clustering.html#clustering). Feel free to try out some of these algorithms yourself on the Iris dataset! 
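As a quick illustration of trying one of those other algorithms, a sketch using a Gaussian mixture model (introduced just below) might look like this; it assumes the Iris X and y from the cells above, and the three components simply mirror the k = 3 used for k-means.

import sklearn.mixture

# fit a 3-component Gaussian mixture model and take hard cluster assignments
gmm = sklearn.mixture.GaussianMixture(n_components=3)
gmm_labels = gmm.fit_predict(X)

# evaluate the clustering against the ground-truth labels with the ARI, as before
print("%0.3f" % sklearn.metrics.adjusted_rand_score(y, gmm_labels))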
A good algorithm to try first is the __Gaussian mixture model__, which is very similar to k-means.\n\n# ## Dimensionality Reduction: Digits Dataset\n#\n# We have seen many times by now that datasets tend to have a lot of features, almost always more than 2 or 3, which prevents us from being able to visualize them wholistically. Furthermore, as the number of features in a dataset increases, it becomes harder to extract useful information from the data, to separate \"signal\" from \"noise\", so to speak. We call this phenomenon the __curse of dimensionality__. It is especially true for image datasets, since images can easily contain thousands of pixels. Dimensionality reduction techniques can help us with both of these problems: we can transform a dataset into 2 or 3 dimensions for __visualization__, or more generally we can perform __feature extraction__, in which we transform a dataset into some lower-dimensional space that should contain more signal and less noise.\n#\n# To present these techniques we will use the digits dataset provided by scikit-learn. This dataset consists of 8x8 grayscale images of handwritten digits. We'll go ahead and load the dataset:\n\n#%%\n\n# load the digits dataset\ndigits = sklearn.datasets.load_digits()\n\n# print dataset stats\nprint(\"X: (%d, %d)\" % digits.data.shape)\nprint(\"y: (%d,)\" % digits.target.shape)\nprint(\"label names:\", digits.target_names)\n\n#%%\n\nsns.heatmap(digits.images[0], cmap=\"Greys\")\nplt.show()\n\n\n# These images are very small, and they use only 16 shades of grey. Later on we will look at another dataset of handwritten digits which is much more widely-known, and much larger. But for now this dataset will suffice.\n\n# ### Principal Component Analysis (PCA)\n#\n# PCA is a classic dimensionality reduction technique. It has been used for all sorts of things, including visualization, pattern recognition, data compression... and so on. PCA computes the __principal components__ of a dataset. In mathematical terms, the principal components of a dataset $X$ are the eigenvectors of the covariance of $X$:\n#\n# $$W_{pca} = V,$$\n# $$\\Sigma = X X^T = V U V^T$$\n#\n# Intuitively, the principal components of a dataset are the axes along which the variance of the dataset is maximized. They are essentially just features, based on the original features of the dataset. A dataset can have as many principal components as it has features, but we typically only take the $N$ most relevant principal components; this truncation is how we reduce the dimensionality of the data. In particular, we use the (truncated) principal component matrix to project each sample $\\vec{x}$ into a lower-dimensional space:\n#\n# $$\\vec{x}_{proj} = W_{pca} \\vec{x}$$\n#\n# Here's a real-world example. You're in a classroom filled with students, and you want to create a system that can distinguish between each individual student using a set of features. You can use whatever features you want: gender, age, height, weight, skin color, hair color, beard, glasses, clothing, nationality... anything. In this situation, the most __salient__ or useful features would be the ones for which there is a lot of variation between individuals; for example, gender wouldn't be a very useful feature in a room full of guys, but nationality would be very useful in a room full of people from many different nations. In other words, you want the features which exhibit the greatest __variance__ in your dataset of people. 
PCA attempts to find these features, except that it is not limited to the features themselves; it can also compute new features by using linear combinations of the original features.\n#\n# Now let's try to understand how PCA works with some code:\n\n#%%\n\n# get the data matrix from the digits dataset\nX = digits.data\n\n# create PCA model\npca = sklearn.decomposition.PCA()\n\n# compute principal components of X\npca.fit(X)\n\n# print stats\nprint(pca.components_.shape)\n\n#%%\n\n# plot the explained variance of each principal component\nx = range(pca.n_components_)\ny = pca.explained_variance_ratio_\n\nplt.plot(x, y)\nplt.title(\"Explained Variance of Principal Components\")\nplt.xlabel(\"Component\")\nplt.ylabel(\"Explained Variance Ratio\")\nplt.show()\n\n\n# In the above code we simply compute the principal components of the digits dataset and we plot the explained variance of each component. Components with the highest explained variance are the \"salient features\" that we were talking about; in this case, the first principal component contains nearly 15% of the variance of the entire dataset. Notice that the components are sorted by their explained variance, which makes it easy to take the $n$ components with the most variance:\n\n#%%\n\n# plot the total variance contained within the first n components\nx = range(pca.n_components_)\ny = [100 * sum(pca.explained_variance_ratio_[:n]) for n in x]\n\nplt.plot(x, y)\nplt.title(\"Total Explained Variance of Principal Components\")\nplt.xlabel(\"Number of Components\")\nplt.ylabel(\"Total Explained Variance (%)\")\nplt.show()\n\n\n# Earlier we presented the idea of separating the signal from the noise in a dataset; variance is one of the simplest ways to determine where such signal occurs. Much of the useful information in a dataset is in its variations; if we can capture most of a dataset's variance with only a subset of features, we can essentially filter out much of the noise without losing very much signal. In this example, we can take the first 20 components, which reduces the dataset from 64 dimensions to 20 dimensions, and still retain roughly 90% of the information in the dataset. 
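A minimal sketch that checks this claim directly: passing a float to n_components asks scikit-learn to keep however many components are needed to reach that fraction of explained variance. X is the digits data matrix from the cells above.

# ask PCA for enough components to retain ~90% of the variance
pca_90 = sklearn.decomposition.PCA(n_components=0.90)
X_reduced = pca_90.fit_transform(X)

print(pca_90.n_components_)                    # roughly 20 components
print(X_reduced.shape)                         # (n_samples, ~20)
print(pca_90.explained_variance_ratio_.sum())  # at least 0.90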
This reduction is useful from two perspectives: on the one hand, we have found a more compact representation of the data (sort of like lossy compression), and on the other hand, we have found features that may be more useful for tasks such as classification.\n#\n# In fact, let's try it -- let's create a classifier and see if using PCA as a preprocessing step improves classification accuracy:\n\n#%%\n\n# define a function to evaluate a kNN classifier (with optional PCA)\ndef evaluate_model(X_train, X_test, y_train, y_test, n_pca=0):\n # create the model\n model = None\n\n if n_pca != 0:\n # if n_pca is specified then create a PCA/kNN model\n model = sklearn.pipeline.Pipeline([\n (\"pca\", sklearn.decomposition.PCA(n_components=n_pca)),\n (\"knn\", sklearn.neighbors.KNeighborsClassifier(1))\n ])\n else:\n # otherwise just use kNN\n model = sklearn.neighbors.KNeighborsClassifier(1)\n\n # train the model\n model.fit(X_train, y_train)\n\n # evaluate the model\n return model.score(X_test, y_test)\n\n\n# create train and test sets\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n digits.data, digits.target, test_size=0.3)\n\n# compare the accuracy of several classifiers by varying the number of principal components\n# a value of 0 means don't use PCA\npca_values = [1, 5, 10, 20, 64, 0]\n\nfor n_pca in pca_values:\n accuracy = 100 * evaluate_model(X_train, X_test, y_train, y_test, n_pca)\n\n print(\"Accuracy (n_pca = %2d): %5.1f%%\" % (n_pca, accuracy))\n\n\n# Well, we didn't really _improve_ the accuracy. The thing is that this dataset is simple enough that kNN can do quite well on its own; to really test the benefit of PCA we would need a more complex dataset, one where kNN doesn't get 98% accuracy out of the box. The more important result here is that _we maintained the same level of accuracy using fewer dimensions_ -- the first 20 principal components provides the same level of classification potential as the original 64 features.\n#\n# We can also use principal components to visualize the dataset. Remember, principal components are like axes, so we can pick any two principal components and project each data point onto them:\n\n#%%\n\n# select two principal components by index\nindices = [0, 1]\n\n# project each data point onto the selected axes\nX_proj = pca.transform(X)\n\nx, y = X_proj[:, indices[0]], X_proj[:, indices[1]]\n\n# plot the projected data\nplt.scatter(x, y, c=digits.target, cmap=\"hsv\")\nplt.xlabel(\"Principal Component %d\" % (indices[0]))\nplt.ylabel(\"Principal Component %d\" % (indices[1]))\nplt.show()\n\n# TODO: try to find the pair of principal components that best separate the colors\n\n\n# We went ahead and colored each data points by class. Remember that this dataset consists of images of handwritten digits, so each data point is an image and the class denotes which digit it is. But even with the first two components, the 10 classes are still pretty mixed up. We could separate the classes pretty well if we could use more components -- we know this from the classification experiment that we just ran -- but when we're limited to only two components it's hard to get good class separation. For this reason, PCA is generally pretty limited as a visualization tool, unless the first two components have a very high explained variance.\n#\n# ### t-distributed Stochastic Neighbor Embedding (t-SNE)\n#\n# When it comes to visualizing high-dimensional data, t-SNE is hands-down the most widely-used tool, and arguably the most effective. 
It is also way more complicated mathematically so we won't even begin to try and explain it. Just know that it can take your high-dimensional dataset and magically put it into 2D or 3D space. Observe:\n\n#%%\n\n# get the data matrix from the digits dataset\nX = digits.data", "target_code": "X_embedded = sklearn.manifold.TSNE(n_components=2).fit_transform(X)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Unsupervised Learning\n#\n# In the \"Supervised Learning\" notebook we introduced the idea of a \"task\" and we studied the family of supervised learning tasks, in which the goal is to learn some kind of mapping from labeled data. This description probably begs the question -- what if the data is unlabeled? Would we even be able to do anything with it? It turns out there are a number of things we can still learn about the data; algorithms that work with unlabeled data are called __unsupervised learning__ algorithms.\n#\n# In unsupervised learning, the task is to learn something about the structure of the data. There are a lot of ways we can define the kind of \"structure\" that we want. For example, we could try to group data points into categories based on their values; this task is called __clustering__. As another example, we could take a high-dimensional dataset and try to respresent it somehow in 2D or 3D so that we can visualize it; this task is called __dimensionality reduction__. In this notebook we will look at these two tasks, as they are some of the most common in unsupervised learning. We will continue to use the Iris dataset and other toy datasets from scikit-learn and seaborn.\n#\n# _Note: some code segments have TODO comments in them. These comments are optional exercises for you to modify the code in a useful way, however they are not meant to be restrictive. Feel free to modify the code in this notebook any way you like; it's a great way to practice your coding skills._\n#\n# ## Getting Started\n#\n# You should have your own Anaconda virtual environment with all of the necessary Python modules installed. You can check by trying to import them:\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport sklearn\nimport sklearn.cluster\nimport sklearn.datasets\nimport sklearn.decomposition\nimport sklearn.manifold\nimport sklearn.metrics\nimport sklearn.model_selection\nimport sklearn.neighbors\nimport sklearn.pipeline\n\n\n# ## Clustering: Iris Dataset\n#\n# So how would we \"cluster\" the Iris dataset? And why would we want to in the first place? After all, we already have the labels, so we already know what category each sample should belong to. As it turns out, this is a great way to evaluate a clustering algorithm -- if we put aside the labels and cluster the dataset using only the data, we can then compare the clusters identified by the algorithm with the original labels. Remember that one of the first questions we must ask before trying a supervised learning task is whether there is truly a pattern in the data, and clustering algorithms allow us to answer this question.\n#\n# To do this we will take the entire dataset, use a clustering algorithm to identify cluster labels for each sample in the dataset, and then try to compare the labels to the true labels.\n#\n# ### K-Means Clustering\n#\n# One of the most commonly-used clustering algorithms is __k-means clustering__, because it is fast and relatively easy to understand. The algorithm is as follows:\n#\n# 1. 
Select $k$ random samples from the dataset to be the initial means\n# 2. Assign a label to each sample in the dataset according to the mean which is nearest to it\n# 3. Update each mean to be the centroid of the samples in its cluster\n# 4. Repeat steps 2 and 3 until the cluster labels converge\n#\n# Here's an example from Wikipedia of k-means in action:\n#\n# \"kmeans\"\n#\n# This algorithm does seem to take the same approach to clustering that k-nearest neighbors takes to classification. In fact, it kind of has the same hyperparameters:\n# 1. The number of clusters $k$\n# 2. The distance function, which is used to find the nearest mean to a sample\n#\n# As with kNN, we will use Euclidean distance by default. As for the number of clusters $k$, since we already know that the Iris dataset has three classes, we will use $k = 3$.\n\n\n# load Iris dataset\niris = sklearn.datasets.load_iris()\nX = iris.data\ny = iris.target\n\n# initialize k-means model\nkmeans = sklearn.cluster.KMeans(n_clusters=3, n_jobs=-1)\n\n# fit the model to the dataset\nkmeans.fit(X)\n\n# compute cluster labels for the dataset\ny_pred = kmeans.predict(X)\n\n# show a side-by-side comparison of cluster labels and true labels\nprint(np.c_[y, y_pred])\n\n\n# It looks like the clustering model was able to cluster the dataset into the three classes, but there's one problem: the cluster indices don't necessarily match up to the class indices. In other words, the clustering model doesn't know what each cluster _is_, it only knows how the data points are clustered. What we need is a way to measure the _similarity_ of the cluster labels and the ground truth labels by accounting for permutations. One metric which provides this kind of measure is the __adjusted Rand index (ARI)__:\n\n\nari = sklearn.metrics.adjusted_rand_score(y, y_pred)\n\nprint(\"%0.3f\" % (ari))\n\n\n# The ARI ranges from -1 to +1, where +1 is perfect similarity. We can also use the scatter plots we developed in the \"Working with Data\" notebook to visually compare the cluster labels with the ground truth labels:\n\n\n# define function to plot a slice of Iris dataset\ndef plot_iris_2d(iris, columns, labels):\n # extract x and y axes\n x, y = iris.data[:, columns[0]], iris.data[:, columns[1]]\n\n # plot x and y\n plt.scatter(x, y, c=labels)\n plt.xlabel(iris.feature_names[columns[0]])\n plt.ylabel(iris.feature_names[columns[1]])\n\n\n# create side-by-side comparison of cluster labels and true labels\nplt.subplots(1, 2, figsize=(10, 5))\n\nplt.subplot(121)\nplot_iris_2d(iris, [0, 2], y_pred)\nplt.title(\"Cluster Labels\")\n\nplt.subplot(122)\nplot_iris_2d(iris, [0, 2], iris.target)\nplt.title(\"Ground Truth Labels\")\n\nplt.show()\n\n# TODO: change the value of k and observe its effect on the ARI and the scatter plots\n\n\n# Now we can see where the clustering model \"mis-clustered\" data points. Note that the colors will match up only if the clustering model happened to assign clusters in the same order as the ground truth labels.\n#\n# It seems like clustering is pretty easy when we have the ground truth labels. But what if we didn't? What if Ronald Fisher had neglected to label each flower that he measured? We would then have just a set of 150 flower measurements with no idea of the species of each flower. Could we use clustering to determine the species? The problem is that in this scenario we don't know the number of species -- in other words, we don't know the value of $k$. We could do a hyperparameter search on $k$, but how would we compare each model? 
The ARI won't help us here because, again, we don't have the ground truth labels.\n#\n# In the absence of ground truth labels, the primary evaluation metric that we can use for k-means is __inertia__, or within-class scatter. The inertia is the sum of the variance in each cluster; a lower inertia generally corresponds to more coherent clusters, so we will seek a value of $k$ which minimizes the inertia:\n\n\n# evaluate k-means for several values of k\nk_values = range(1, 11)\n\nfor k in k_values:\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n y_pred = model.fit_predict(X)\n\n print(\"k = %2d: %0.2f\" % (k, model.inertia_))\n\n\n# Hmm... it seems like the inertia just keeps decreasing as $k$ increases. Well, the largest possible value of $k$ is the number of data points, so let's try that:\n\n\nk = iris.data.shape[0]\nmodel = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\ny_pred = model.fit_predict(X)\n\nprint(\"k = %2d: %0.2f\" % (k, model.inertia_))\n\n\n# If we assume that there are as many clusters as there are data points, we effectively assign each data point to its own cluster. Since each cluster then has only one data point, the variance of each cluster is 0 and so the total variance, or inertia, is also 0. But this result is not helpful to us. It turns out that the inertia has an inherent __bias__ toward more complex models. The most common way to deal with this bias is to use what's called the __elbow method__, which is best explained with a plot:\n\n\ndef evaluate_kmeans(k):\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n model.fit_predict(X)\n\n return model.inertia_\n\n\nx = range(1, 21)\ny = [evaluate_kmeans(k) for k in x]\n\nplt.plot(x, y)\nplt.xticks(x)\nplt.show()\n\n\n# As the plot shows, even though the inertia decreases indefinitely, it also begins to level off, creating an \"elbow\". The elbow method takes the value of $k$ at which this elbow occurs to be the best value of $k$. The idea is that this value gives us the best \"bang for our buck\" -- the simplest model at which the minimum inertia occurs (more or less). But where exactly does the elbow occur? Is it 3? Or 4? Unfortunately, the elbow method isn't an exact method. But can we really be upset? After all, without ground truth labels, the number of clusters in a dataset is quite subjective:\n\n\nk_values = [2, 3, 4, 5]\n\nplt.subplots(1, len(k_values), figsize=(5 * len(k_values), 5))\n\nfor i in range(len(k_values)):\n k = k_values[i]\n model = sklearn.cluster.KMeans(n_clusters=k, n_jobs=-1)\n labels = model.fit_predict(X)\n\n plt.subplot(1, len(k_values), i + 1)\n plot_iris_2d(iris, [0, 2], labels)\n plt.title(\"k = %d\" % (k))\n\nplt.show()\n\n\n# How many clusters do you see? Can you tell which plot is the most \"correct\"? Clustering becomes much more difficult when we don't have ground truth labels or the number of clusters, because we don't have an objective way to select the best model. This issue pervades virtually every clustering algorithm in existence: __what is the right number of clusters for a dataset__? The philosophical answer is that there is no such thing! Clustering is unsupervised learning; in other words, we don't know what we're looking for. 
The best we can do is to hand-craft our own metrics based on ideas like inertia, and to use what hints we can get from the task we are trying to do.\n#\n# For a more in-depth overview of k-means and the many other clustering algorithms out there, we refer you to the [scikit-learn documentation](http://scikit-learn.org/stable/modules/clustering.html#clustering). Feel free to try out some of these algorithms yourself on the Iris dataset! A good algorithm to try first is the __Gaussian mixture model__, which is very similar to k-means.\n\n# ## Dimensionality Reduction: Digits Dataset\n#\n# We have seen many times by now that datasets tend to have a lot of features, almost always more than 2 or 3, which prevents us from being able to visualize them wholistically. Furthermore, as the number of features in a dataset increases, it becomes harder to extract useful information from the data, to separate \"signal\" from \"noise\", so to speak. We call this phenomenon the __curse of dimensionality__. It is especially true for image datasets, since images can easily contain thousands of pixels. Dimensionality reduction techniques can help us with both of these problems: we can transform a dataset into 2 or 3 dimensions for __visualization__, or more generally we can perform __feature extraction__, in which we transform a dataset into some lower-dimensional space that should contain more signal and less noise.\n#\n# To present these techniques we will use the digits dataset provided by scikit-learn. This dataset consists of 8x8 grayscale images of handwritten digits. We'll go ahead and load the dataset:\n\n\n# load the digits dataset\ndigits = sklearn.datasets.load_digits()\n\n# print dataset stats\nprint(\"X: (%d, %d)\" % digits.data.shape)\nprint(\"y: (%d,)\" % digits.target.shape)\nprint(\"label names:\", digits.target_names)\n\n\nsns.heatmap(digits.images[0], cmap=\"Greys\")\nplt.show()\n\n\n# These images are very small, and they use only 16 shades of grey. Later on we will look at another dataset of handwritten digits which is much more widely-known, and much larger. But for now this dataset will suffice.\n\n# ### Principal Component Analysis (PCA)\n#\n# PCA is a classic dimensionality reduction technique. It has been used for all sorts of things, including visualization, pattern recognition, data compression... and so on. PCA computes the __principal components__ of a dataset. In mathematical terms, the principal components of a dataset $X$ are the eigenvectors of the covariance of $X$:\n#\n# $$W_{pca} = V,$$\n# $$\\Sigma = X X^T = V U V^T$$\n#\n# Intuitively, the principal components of a dataset are the axes along which the variance of the dataset is maximized. They are essentially just features, based on the original features of the dataset. A dataset can have as many principal components as it has features, but we typically only take the $N$ most relevant principal components; this truncation is how we reduce the dimensionality of the data. In particular, we use the (truncated) principal component matrix to project each sample $\\vec{x}$ into a lower-dimensional space:\n#\n# $$\\vec{x}_{proj} = W_{pca} \\vec{x}$$\n#\n# Here's a real-world example. You're in a classroom filled with students, and you want to create a system that can distinguish between each individual student using a set of features. You can use whatever features you want: gender, age, height, weight, skin color, hair color, beard, glasses, clothing, nationality... anything. 
In this situation, the most __salient__ or useful features would be the ones for which there is a lot of variation between individuals; for example, gender wouldn't be a very useful feature in a room full of guys, but nationality would be very useful in a room full of people from many different nations. In other words, you want the features which exhibit the greatest __variance__ in your dataset of people. PCA attempts to find these features, except that it is not limited to the features themselves; it can also compute new features by using linear combinations of the original features.\n#\n# Now let's try to understand how PCA works with some code:\n\n\n# get the data matrix from the digits dataset\nX = digits.data\n\n# create PCA model\npca = sklearn.decomposition.PCA()\n\n# compute principal components of X\npca.fit(X)\n\n# print stats\nprint(pca.components_.shape)\n\n\n# plot the explained variance of each principal component\nx = range(pca.n_components_)\ny = pca.explained_variance_ratio_\n\nplt.plot(x, y)\nplt.title(\"Explained Variance of Principal Components\")\nplt.xlabel(\"Component\")\nplt.ylabel(\"Explained Variance Ratio\")\nplt.show()\n\n\n# In the above code we simply compute the principal components of the digits dataset and we plot the explained variance of each component. Components with the highest explained variance are the \"salient features\" that we were talking about; in this case, the first principal component contains nearly 15% of the variance of the entire dataset. Notice that the components are sorted by their explained variance, which makes it easy to take the $n$ components with the most variance:\n\n\n# plot the total variance contained within the first n components\nx = range(pca.n_components_)\ny = [100 * sum(pca.explained_variance_ratio_[:n]) for n in x]\n\nplt.plot(x, y)\nplt.title(\"Total Explained Variance of Principal Components\")\nplt.xlabel(\"Number of Components\")\nplt.ylabel(\"Total Explained Variance (%)\")\nplt.show()\n\n\n# Earlier we presented the idea of separating the signal from the noise in a dataset; variance is one of the simplest ways to determine where such signal occurs. Much of the useful information in a dataset is in its variations; if we can capture most of a dataset's variance with only a subset of features, we can essentially filter out much of the noise without losing very much signal. In this example, we can take the first 20 components, which reduces the dataset from 64 dimensions to 20 dimensions, and still retain roughly 90% of the information in the dataset. 
This reduction is useful from two perspectives: on the one hand, we have found a more compact representation of the data (sort of like lossy compression), and on the other hand, we have found features that may be more useful for tasks such as classification.\n#\n# In fact, let's try it -- let's create a classifier and see if using PCA as a preprocessing step improves classification accuracy:\n\n\n# define a function to evaluate a kNN classifier (with optional PCA)\ndef evaluate_model(X_train, X_test, y_train, y_test, n_pca=0):\n # create the model\n model = None\n\n if n_pca != 0:\n # if n_pca is specified then create a PCA/kNN model\n model = sklearn.pipeline.Pipeline([\n (\"pca\", sklearn.decomposition.PCA(n_components=n_pca)),\n (\"knn\", sklearn.neighbors.KNeighborsClassifier(1))\n ])\n else:\n # otherwise just use kNN\n model = sklearn.neighbors.KNeighborsClassifier(1)\n\n # train the model\n model.fit(X_train, y_train)\n\n # evaluate the model\n return model.score(X_test, y_test)\n\n\n# create train and test sets\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n digits.data, digits.target, test_size=0.3)\n\n# compare the accuracy of several classifiers by varying the number of principal components\n# a value of 0 means don't use PCA\npca_values = [1, 5, 10, 20, 64, 0]\n\nfor n_pca in pca_values:\n accuracy = 100 * evaluate_model(X_train, X_test, y_train, y_test, n_pca)\n\n print(\"Accuracy (n_pca = %2d): %5.1f%%\" % (n_pca, accuracy))\n\n\n# Well, we didn't really _improve_ the accuracy. The thing is that this dataset is simple enough that kNN can do quite well on its own; to really test the benefit of PCA we would need a more complex dataset, one where kNN doesn't get 98% accuracy out of the box. The more important result here is that _we maintained the same level of accuracy using fewer dimensions_ -- the first 20 principal components provides the same level of classification potential as the original 64 features.\n#\n# We can also use principal components to visualize the dataset. Remember, principal components are like axes, so we can pick any two principal components and project each data point onto them:\n\n\n# select two principal components by index\nindices = [0, 1]\n\n# project each data point onto the selected axes\nX_proj = pca.transform(X)\n\nx, y = X_proj[:, indices[0]], X_proj[:, indices[1]]\n\n# plot the projected data\nplt.scatter(x, y, c=digits.target, cmap=\"hsv\")\nplt.xlabel(\"Principal Component %d\" % (indices[0]))\nplt.ylabel(\"Principal Component %d\" % (indices[1]))\nplt.show()\n\n# TODO: try to find the pair of principal components that best separate the colors\n\n\n# We went ahead and colored each data points by class. Remember that this dataset consists of images of handwritten digits, so each data point is an image and the class denotes which digit it is. But even with the first two components, the 10 classes are still pretty mixed up. We could separate the classes pretty well if we could use more components -- we know this from the classification experiment that we just ran -- but when we're limited to only two components it's hard to get good class separation. For this reason, PCA is generally pretty limited as a visualization tool, unless the first two components have a very high explained variance.\n#\n# ### t-distributed Stochastic Neighbor Embedding (t-SNE)\n#\n# When it comes to visualizing high-dimensional data, t-SNE is hands-down the most widely-used tool, and arguably the most effective. 
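A minimal sketch of the embedding call this example targets (sklearn.manifold.TSNE), shown ahead of the "Observe" cell below; it assumes X = digits.data and leaves the t-SNE settings at their defaults, with a scatter plot colored by digit class as a quick visual check.

# compute a 2-D t-SNE embedding of the digits data
X_embedded = sklearn.manifold.TSNE(n_components=2).fit_transform(X)

# plot the embedding, colored by digit class
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=digits.target, cmap="hsv")
plt.title("t-SNE embedding of the digits dataset")
plt.show()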
It is also way more complicated mathematically so we won't even begin to try and explain it. Just know that it can take your high-dimensional dataset and magically put it into 2D or 3D space. Observe:\n\n\n# get the data matrix from the digits dataset\nX = digits.data\n", "project_metadata": {"full_name": "CUFCTL/creative-inquiry", "description": "Repository for FCTL creative inquiries", "topics": [], "git_url": "git://github.com/CUFCTL/creative-inquiry.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-04-19T14:19:33Z", "size": 1403, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 323042, "TeX": 5727}, "last_updated": "2020-12-09T18:24:40Z"}, "intent": "# compute t-SNE of X"}, {"original_comment": "# 1.15 how many times 'Nantucket Nectar' ordered\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Variables management\n# %who: list of existed variable (just name)\n# %whos: ... with type and value\n# del: delete 1 variable (delete memory), different with \"None-value variable\"\n#a = 5\n#a = None\n#del a\n\n\n# ***Name: Nguy\u1ec5n \u0110\u00ecnh Ph\u01b0\u01a1ng***\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\n#%%\n\ndf = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\open_food_facts.csv', low_memory=False)\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.tail()\n\n#%%\n\n# 1.2\nprint(df.shape[0])\n# Value in index 545 (start from 1) is not valid\n\n#%%\n\n# 1.3 Print name of all columns\ndf.columns\n\n#%%\n\n# 1.4 name of 99th column\ndf.columns[98]\n\n#%%\n\n# 1.5 summarize the dataframe\ndf.describe()\n\n#%%\n\n# 1.7 distinct creator in dataset\nlen(set(df.creator))\n\n#%%\n\nproduct = df.dropna(subset=['product_name'])\n# 1.8 the most product item in product_name column\nx = product['product_name'].value_counts().idxmax()\ny = product['product_name'].value_counts().max()\nprint(x, y)\n# modes = df.mode()\n# modes['product_name'][0]\n\n#%%\n\n# just a research note\ndk = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],\n [np.nan, np.nan, np.nan, 5]],\n columns=list('ABCD'))\ndk.dropna(subset=['A'])\n\n#%%\n\n# 1.9 product_name in total\nlen(set(product['product_name']))\n\n#%%\n\n# count or check if nan in column\ndf.countries_en.isnull().sum()\n\n#%%\n\n# 1.10 the most brand item\nbrand = df.dropna(subset=['brands'])\nbrand['brands'].value_counts().head(4)\n\n#%%\n\n# 1.11 different countries\n# don't have nan\nlen(set(df['countries_en']))\n\n#%%\n\n# 1.12 what product with most [energy-from-fat_100ml]\nkf = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\open_food_facts.tsv', sep='\\t', low_memory=False)\n# h\u00ecnh nh\u01b0 kh\u00f4ng c\u00f3 d\u1eef li\u1ec7u c\u1ed9t energy-from-fat_100ml\n\n#%%\n\n# 1.13 sort by product name\nproduct.sort_values(by='product_name')['product_name']\n\n#%%\n\ndf2 = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\chipotle.tsv', sep='\\t', low_memory=False)\ndf2.head()\n\n#%%\n\n# 1.14 show price each item\nname = df2.drop_duplicates(subset='item_name')\nname[['item_name', 'item_price']]\n\n#%%", "target_code": "df2[df2['item_name'] == 'Nantucket Nectar']['order_id'].count()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Variables management\n# %who: list of existed variable (just name)\n# %whos: ... 
with type and value\n# del: delete 1 variable (delete memory), different with \"None-value variable\"\n#a = 5\n#a = None\n#del a\n\n\n# ***Name: Nguy\u1ec5n \u0110\u00ecnh Ph\u01b0\u01a1ng***\n\n\nimport numpy as np\nimport pandas as pd\n\n\ndf = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\open_food_facts.csv', low_memory=False)\n\n\ndf.head()\n\n\ndf.tail()\n\n\n# 1.2\nprint(df.shape[0])\n# Value in index 545 (start from 1) is not valid\n\n\n# 1.3 Print name of all columns\ndf.columns\n\n\n# 1.4 name of 99th column\ndf.columns[98]\n\n\n# 1.5 summarize the dataframe\ndf.describe()\n\n\n# 1.7 distinct creator in dataset\nlen(set(df.creator))\n\n\nproduct = df.dropna(subset=['product_name'])\n# 1.8 the most product item in product_name column\nx = product['product_name'].value_counts().idxmax()\ny = product['product_name'].value_counts().max()\nprint(x, y)\n# modes = df.mode()\n# modes['product_name'][0]\n\n\n# just a research note\ndk = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],\n [np.nan, np.nan, np.nan, 5]],\n columns=list('ABCD'))\ndk.dropna(subset=['A'])\n\n\n# 1.9 product_name in total\nlen(set(product['product_name']))\n\n\n# count or check if nan in column\ndf.countries_en.isnull().sum()\n\n\n# 1.10 the most brand item\nbrand = df.dropna(subset=['brands'])\nbrand['brands'].value_counts().head(4)\n\n\n# 1.11 different countries\n# don't have nan\nlen(set(df['countries_en']))\n\n\n# 1.12 what product with most [energy-from-fat_100ml]\nkf = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\open_food_facts.tsv', sep='\\t', low_memory=False)\n# h\u00ecnh nh\u01b0 kh\u00f4ng c\u00f3 d\u1eef li\u1ec7u c\u1ed9t energy-from-fat_100ml\n\n\n# 1.13 sort by product name\nproduct.sort_values(by='product_name')['product_name']\n\n\ndf2 = pd.read_csv(\n r'G:\\Google Drive-Storage\\Home_Pratice\\chipotle.tsv', sep='\\t', low_memory=False)\ndf2.head()\n\n\n# 1.14 show price each item\nname = df2.drop_duplicates(subset='item_name')\nname[['item_name', 'item_price']]\n\n", "project_metadata": {"full_name": "duyet/JVN-Basic-Python-R-Training", "description": "Basic Python/R for JVN Pre-master", "topics": [], "git_url": "git://github.com/duyet/JVN-Basic-Python-R-Training.git", "stars": 4, "watchers": 4, "forks": 6, "created": "2017-08-10T14:36:47Z", "size": 37030, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5345510, "TeX": 556132}, "last_updated": "2020-04-30T12:31:03Z"}, "intent": "# how many times 'Nantucket Nectar' ordered"}, {"original_comment": "# Adding the input layerand the LSTM layer\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#\n\n#%%\n\n# Importing Libraries\nimport random\nfrom keras.layers import LSTM\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n#%%\n\n# Importing dataset\ntrain = pd.read_csv('../input/train_1.csv').fillna(0)\npage = train['Page']\ntrain.head()\n\n#%%\n\n# Dropping Page Column\ntrain = train.drop('Page', axis=1)\n\n#%%\n\n# Using Data From Random Row for Training and Testing\n\nrow = train.iloc[90000, :].values\nX = row[0:549]\ny = row[1:550]\n\n# Splitting the dataset into the Training set and Test set\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.3, random_state=0)\n\n\n# Feature Scaling\nsc = MinMaxScaler()\nX_train = np.reshape(X_train, (-1, 
1))\ny_train = np.reshape(y_train, (-1, 1))\nX_train = sc.fit_transform(X_train)\ny_train = sc.fit_transform(y_train)\n\n#%%\n\n# Training LSTM\n\n# Reshaping Array\nX_train = np.reshape(X_train, (384, 1, 1))\n\n\n# Importing the Keras libraries and packages for LSTM\n\n# Initialising the RNN\nregressor = Sequential()\n\n# Adding the input layerand the LSTM layer\nregressor.add(LSTM(units=8, activation='relu', input_shape=(None, 1)))\n\n\n# Adding the output layer\nregressor.add(Dense(units=1))\n\n# Compiling the RNN\nregressor.compile(optimizer='adam', loss='mean_squared_error')\n\n# Fitting the RNN to the Training set\nregressor.fit(X_train, y_train, batch_size=10, epochs=100, verbose=0)\n\n#%%\n\n# Getting the predicted Web View\ninputs = X_test\ninputs = np.reshape(inputs, (-1, 1))\ninputs = sc.transform(inputs)\ninputs = np.reshape(inputs, (165, 1, 1))\ny_pred = regressor.predict(inputs)\ny_pred = sc.inverse_transform(y_pred)\n\n#%%\n\n# Visualising Result\nplt.figure\nplt.plot(y_test, color='red', label='Real Web View')\nplt.plot(y_pred, color='blue', label='Predicted Web View')\nplt.title('Web View Forecasting')\nplt.xlabel('Number of Days from Start')\nplt.ylabel('Web View')\nplt.legend()\nplt.show()\n\n#%%\n\n# As you can see the prediction is quite accurate for a test set. Now repeat this for some other rows\n\n#%%\n\nrow = train.iloc[0, :].values\nX = row[0:549]\ny = row[1:550]\n\n# Splitting the dataset into the Training set and Test set\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.3, random_state=0)\n\n\n# Feature Scaling\nsc = MinMaxScaler()\nX_train = np.reshape(X_train, (-1, 1))\ny_train = np.reshape(y_train, (-1, 1))\nX_train = sc.fit_transform(X_train)\ny_train = sc.fit_transform(y_train)\n\n# Training LSTM\n\n# Reshaping Array\nX_train = np.reshape(X_train, (384, 1, 1))\n\n\n# Importing the Keras libraries and packages for LSTM\n\n# Initialising the RNN\nregressor = Sequential()", "target_code": "regressor.add(LSTM(units=8, activation='relu', input_shape=(None, 1)))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#\n\n\n# Importing Libraries\nimport random\nfrom keras.layers import LSTM\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\n# Importing dataset\ntrain = pd.read_csv('../input/train_1.csv').fillna(0)\npage = train['Page']\ntrain.head()\n\n\n# Dropping Page Column\ntrain = train.drop('Page', axis=1)\n\n\n# Using Data From Random Row for Training and Testing\n\nrow = train.iloc[90000, :].values\nX = row[0:549]\ny = row[1:550]\n\n# Splitting the dataset into the Training set and Test set\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.3, random_state=0)\n\n\n# Feature Scaling\nsc = MinMaxScaler()\nX_train = np.reshape(X_train, (-1, 1))\ny_train = np.reshape(y_train, (-1, 1))\nX_train = sc.fit_transform(X_train)\ny_train = sc.fit_transform(y_train)\n\n\n# Training LSTM\n\n# Reshaping Array\nX_train = np.reshape(X_train, (384, 1, 1))\n\n\n# Importing the Keras libraries and packages for LSTM\n\n# Initialising the RNN\nregressor = Sequential()\n\n# Adding the input layerand the LSTM layer\nregressor.add(LSTM(units=8, activation='relu', input_shape=(None, 1)))\n\n\n# Adding the output layer\nregressor.add(Dense(units=1))\n\n# Compiling the RNN\nregressor.compile(optimizer='adam', 
loss='mean_squared_error')\n\n# Fitting the RNN to the Training set\nregressor.fit(X_train, y_train, batch_size=10, epochs=100, verbose=0)\n\n\n# Getting the predicted Web View\ninputs = X_test\ninputs = np.reshape(inputs, (-1, 1))\ninputs = sc.transform(inputs)\ninputs = np.reshape(inputs, (165, 1, 1))\ny_pred = regressor.predict(inputs)\ny_pred = sc.inverse_transform(y_pred)\n\n\n# Visualising Result\nplt.figure\nplt.plot(y_test, color='red', label='Real Web View')\nplt.plot(y_pred, color='blue', label='Predicted Web View')\nplt.title('Web View Forecasting')\nplt.xlabel('Number of Days from Start')\nplt.ylabel('Web View')\nplt.legend()\nplt.show()\n\n\n# As you can see the prediction is quite accurate for a test set. Now repeat this for some other rows\n\n\nrow = train.iloc[0, :].values\nX = row[0:549]\ny = row[1:550]\n\n# Splitting the dataset into the Training set and Test set\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.3, random_state=0)\n\n\n# Feature Scaling\nsc = MinMaxScaler()\nX_train = np.reshape(X_train, (-1, 1))\ny_train = np.reshape(y_train, (-1, 1))\nX_train = sc.fit_transform(X_train)\ny_train = sc.fit_transform(y_train)\n\n# Training LSTM\n\n# Reshaping Array\nX_train = np.reshape(X_train, (384, 1, 1))\n\n\n# Importing the Keras libraries and packages for LSTM\n\n# Initialising the RNN\nregressor = Sequential()\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# Add input layer and LSTM layer"}, {"original_comment": "# now we can drop the case_label column\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ---\n\n# # Data Cleaning\n\n# First of all we should get the metadata dataset with unique identifiers, we shall relate the main subject and countries on separate tables\n#\n# ---\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.style.use('ggplot')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Merging judgements and orders\n\n#%%\n\nget_ipython().system('ls -la ../data/*')\n\n#%%\n\ndf_o = pd.read_csv('../data/orders_metadata.csv', encoding='Latin-1')\ndf_j = pd.read_csv('../data/judgements_metadata.csv',\n encoding='Latin-1') # utf8 expected characters\ndf_j = df_j[list(df_o.columns.values)] # order the columns\ndf_j['case_type'] = 'Judgement'\ndf_o['case_type'] = 'Order'\nraw = df_j.append(df_o, ignore_index=True)\n\n#%%\n\nprint(len(raw), len(raw['source'].unique()))\n\n\n# Since the lenght of the dataset isn't the same as the unique identifiers means that we have to deal with the main_subject\n\n# Write the main_subjects file\n\n#%%\n\n#df[['source','main_subject']].to_csv('../data/main_subjects.csv', index=False)\n\n\n# Grouping the unique cases by celex number and re-merge them with the original set\n\n#%%\n\ndef get_grouped_df(df, identifier, groupedby):\n subjects = df.groupby(identifier).count()[groupedby]\n df.index = df[identifier]\n df = df.loc[:, df.columns != groupedby]\n df = df[~df.index.duplicated(keep='first')]\n assert len(subjects) == len(df)\n df = df.join(subjects, how='left').reset_index(drop=True)\n return df\n\n#%%\n\ndf = get_grouped_df(raw, 'source', 
'main_subject')\n\n#%%\n\nprint(len(df))\ndf.head()\n\n\n# test of uniqueness\n\n#%%\n\ndf[df['source'] == '61976CJ0003'] # the case has 4 main subjects\n\n#%%\n\ndef missing_values_table(df):\n mis_val = df.isnull().sum()\n mis_val_percent = 100 * df.isnull().sum()/len(df)\n mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n mis_val_table = mis_val_table.rename(\n columns={0: 'Missing Values', 1: '% of Total Values'})\n return mis_val_table\n\n#%%\n\nmissing = missing_values_table(df)\n# showing only all those are not zero\nmissing = missing[missing['Missing Values'] > 0]\nprint(missing)\n\n#%%\n\nplt.figure(figsize=(10, 4))\nplt.barh(range(len(missing)), missing['% of Total Values'],\n align='center', color='#003399', alpha=0.5, label='not_specified')\nplt.yticks(range(len(missing)), missing.index)\nplt.title('% Missing Values')\nplt.show()\n\n#%%\n\ndef convert_nan(df, columns):\n for column in columns:\n df[str(column)] = ['not_specified' if str(i) ==\n 'nan' else i for i in df[str(column)]]\n return df\n\n#%%\n\ndf = convert_nan(df, ['judge', 'advocate', 'chamber', 'country'])\n\n#%%\n\n# change all the nulls for not specified except date\nprint(missing_values_table(df)[missing_values_table(df)['Missing Values'] > 0])\n\n\n# ### String to string columns\n\n# At this point everything except dates are strings, the NaN are float and it's difficult to handle it\n\n#%%\n\ndef to_str(df):\n columns = [i for i in df.columns.values]\n for column in columns:\n df[str(column)] = df[str(column)].apply(lambda x: str(x))\n return df\n\n#%%\n\ndf = to_str(df)\n\n#%%\n\n# test\n#columns = [i for i in df.columns.values]\n# for c in columns:\n# print(set([type(i) for i in df[str(c)]]))\n\n\n# ### Handling dates and time\n\n#%%\n\ndf['lodge_date'] = pd.to_datetime(df['lodge_date'], format='%d/%m/%Y')\ndf['document_date'] = pd.to_datetime(df['document_date'], format='%d/%m/%Y')\n\n#%%\n\ndf['year_document'] = pd.DatetimeIndex(df['document_date']).year\ndf['month_document'] = pd.DatetimeIndex(df['document_date']).month\ndf['year_lodge'] = pd.DatetimeIndex(df['lodge_date']).year\ndf['month_lodge'] = pd.DatetimeIndex(df['lodge_date']).month\n\n\n# Cases with no date\n\n#%%\n\ndf[df['lodge_date'].isnull()]\n\n\n# For this case we gop to eulex=62014CJ0049, to paste 23/01/2014 manually\n\n#%%\n\ndf.loc[df['source'] == '62014CJ0049', 'lodge_date'] = pd.to_datetime(\n '23/01/2014', format='%d/%m/%Y')\n\n#%%\n\ndf[df['document_date'].isnull()]\n\n\n# Creating a new feature, the size in time of the case\n\n#%%\n\ndf['case_time'] = df['document_date'] - df['lodge_date']\n\n#%%\n\ndf['case_time'].head(3) # we expect them to be all positives\n\n\n# Check out the exceptions\n\n#%%\n\ndf[df['case_time'] < pd.Timedelta('0 days')]\n\n\n# There's one case, and in eulex=61969CJ0074, to paste 1969-12-04 manually\n\n#%%\n\ndf.loc[df['source'] == '61969CJ0074', 'lodge_date'] = pd.to_datetime(\n '04/12/1969', format='%d/%m/%Y')\n\n#%%\n\n# re-run case_time since we add the correct date\ndf['case_time'] = df['document_date'] - df['lodge_date']\n\n#%%\n\ndf[df['case_time'] < pd.Timedelta('0 days')]\n\n\n# ### Number of countries and corrections\n\n# Create a column counting the number of countries involved in the case\n\n#%%\n\ndf['n_countries'] = df['country'].apply(\n lambda x: len(x.split(';')) if x != 'nan' else 0)\n\n#%%\n\n# test\ndf[df['source'] == '62000CJ0204'] # we expect 4 countries\n\n\n# We need a manual check to see what are the wrong cases in all those where there is only one country 
involved\n\n#%%\n\ndf[df['n_countries'] == 1]['country'].sort_values().unique()\n\n\n# It looks like there's one advocate in the country column, fortunately it's the only case, and we re-arange them manually\n\n#%%\n\ndf[df['country'] == 'La Pergola'][['country', 'judge', 'advocate']]\n\n#%%\n\ndf.loc[df['country'] == 'La Pergola', 'judge'] = 'La Pergola'\ndf.loc[df['country'] == 'La Pergola', 'advocate'] = 'Jacobs'\n\n#%%\n\n# Incorrect cases\ncases = {\n ('Provisional data', 'not_specified'),\n ('NLD', 'Netherlands'),\n ('La Pergola', 'not_specified'),\n ('GBR', 'United Kingdom'),\n ('FRA', 'France'),\n ('FIN', 'Finland'),\n ('DEU', 'Germany'),\n ('BEL', 'Belgium'),\n ('XX', 'not_specified'),\n ('USA', 'United States')\n}\n\n#%%\n\ndef find_replace(l, cases):\n for a, b in cases:\n l = [row.replace(a, b) for row in l]\n return l\n\n#%%\n\ndf['country'] = find_replace(df['country'], cases)\n\n#%%\n\ndf[df['country'] == 'FRA'] # test country\n\n\n# ---\n\n# ### Countries Table\n\n# Now we need to create an independent dataset which enlist and relates the cases with the unique countries\n\n#%%\n\ndef create_countries(celex, country, n_c):\n # vector of countries raw sorted by cases\n country_vec = [row.split(';') for row in country]\n country_vec = [item for sublist in country_vec for item in sublist]\n country_vec = [i if i[0] != ' ' else i[1:] for i in country_vec]\n # vector of celex ids raw sorted by cases\n celex_vec = []\n for idx, n in enumerate(n_c):\n if n == 0:\n celex_vec.append([celex[idx]])\n else:\n celex_vec.append([celex[idx] for k in range(n)])\n celex_vec = [item for sublist in celex_vec for item in sublist]\n assert len(celex_vec) == len(country_vec), 'Not equal lenght'\n return pd.DataFrame({'source': celex_vec, 'country': country_vec})\n\n#%%\n\ndf_countries = create_countries(df['source'], df['country'], df['n_countries'])\n\n#%%\n\ndf_countries.head()\n\n#%%\n\n#df_countries.to_csv('../data/countries.csv', index=False)\n\n\n# All the wrong cases are replaced\n\n# ### Optimizing labels\n\n#%%\n\ndf['joined_cases'] = [1 if i.find(\n 'oin') != -1 else 0 for i in df['case_label']]\n\n#%%\n\ndf['ecli'] = [i[22:] for i in df['ecli']]\n\n\n# ---\n# ### Chamber country\n\n#%%\n\ndf['chamber'].unique()\n\n\n# After check in eulex manually\n\n#%%\n\n# Incorrect cases\ncases2 = {\n ('512032', 'Sixth Chamber'),\n ('512031', 'Sixth Chamber'),\n ('sixi\u00e8me chambre', 'Sixth chamber'),\n ('sixi\u00c3\u00a8me chambre', 'Sixth chamber'),\n ('as amended by Order of 10 July 1975', 'First chamber')\n}\n\n#%%\n\ndf['chamber'] = find_replace(df['chamber'], cases2)\n\n#%%\n\ndf[df['chamber'] == '512032'] # test chamber\n\n#%%\n\ndf['country-chamber'] = df['country']+'-'+df['chamber']\n\n#%%\n\n#sorted(df[df['n_countries'] == 1]['country-chamber'].unique())\n\n#%%\n\nyears = sorted(df['year_document'].unique())\nnan_chamber = []\nfor year in years:\n group = df[df['year_document'] == year].groupby('chamber').count()\n nan_chamber.append(group.loc['not_specified']\n ['source']/group['source'].sum()*100)\n\n#%%\n\nplt.figure(figsize=(18, 6))\nposition = range(len(years))\nplt.bar(position, nan_chamber, align='center',\n alpha=0.8, label='not_specified')\nplt.xticks(position, years, rotation=90)\nplt.legend(loc='best')\nplt.title('% of Null Chamber over the years')\nplt.show()\n\n\n# ---\n\n# ## Validation of columns in rulings\n\n# This step is additional, since the columns related with the rulings where mixed up, what we wnat at the end of the day it's to merge the two parts of the 
case\n\n# Applying the same methodology\n\n#%%\n\ndf_or = pd.read_csv('../data/orders_ruling.csv', encoding='utf-8')\ndf_jr = pd.read_csv('../data/judgements_ruling.csv', encoding='utf-8')\ndf_jr = df_jr[list(df_or.columns.values)] # order the columns\nraw_r = df_jr.append(df_or, ignore_index=True)\nraw_r['c8'] = raw_r['c1']\n\n#%%\n\nprint(len(raw_r), len(raw_r['c1'].unique()))\n\n#%%\n\ndf_r = get_grouped_df(raw_r, 'c1', 'c8')\n\n\n# The columns ruling_title are correct and naturally the sources (c1,c2)\n\n#%%\n\ndf_r = df_r[['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7']]\ndf_r.columns = ['source', 'ruling_title', 'ruling_name',\n 'ruling_type', 'ruling_content', 'case_label', 'extra']\n\n#%%\n\nprint(len(df_r))\ndf_r.head()\n\n#%%\n\ndf_r = convert_nan(df_r, ['ruling_title', 'ruling_type',\n 'ruling_content', 'case_label', 'extra'])\n\n\n# We follow this process to check not specified values in the columns and the and the swaping of places\n# - From the last column to the initial\n# - Check nulls of the column\n# - Check if there's elements of the 'column-1' in column\n# - Check if there's elements of the 'column-2' in column\n# - Repeat for all the columns till ends\n\n# ---\n# ### Extra column check\n\n# - Null values in other column\n\n#%%\n\ndf_r[df_r['extra'] != 'not_specified']\n\n#%%\n\n# correct 62011CJ0363\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_name'] = list(\n df.loc[df['source'] == '62011CJ0363', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_type'] = list(\n df.loc[df['source'] == '62011CJ0363', 'ruling_type'])[0]\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_content'] = list(\n df.loc[df['source'] == '62011CJ0363', 'ruling_content'])[0]\n\n#%%\n\ndf_r.loc[df_r['source'] == '62011CJ0363', [\n 'ruling_name', 'ruling_type', 'ruling_content']]\n\n#%%\n\n# correct 62011CJ0363\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_name'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_type'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_type'])[0]\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_content'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_content'])[0]\n\n#%%\n\ndf_r.loc[df_r['source'] == '62015CO0462', [\n 'ruling_name', 'ruling_type', 'ruling_content']]\n\n#%%\n\ndf_r = df_r.drop(['extra'], axis=1)\n\n\n# ---\n# ### Case label column check\n\n# - Null values in case_label column\n\n#%%\n\n_slice = df_r[df_r['case_label'] == 'not_specified']\n\n\n# we have to move one column to the left\n\n#%%\n\n_slice = _slice.reset_index(drop=True)\nprint(len(_slice))\n_slice.head()\n\n#%%\n\n_slice['case_label'] = list(_slice.loc[:, ('ruling_content')])\n_slice['ruling_content'] = list(_slice.loc[:, ('ruling_type')])\n_slice['ruling_type'] = list(_slice.loc[:, ('ruling_name')])\n\n\n# We take them out from the original code, then we remerge them\n\n#%%\n\ndf_r = df_r.drop(df_r[df_r['source'].isin(list(_slice['source']))].index)\ndf_r = df_r.append(_slice).reset_index(drop=True)\n\n\n# - We have to repeat the moving to the left\n\n#%%\n\n_slice2 = df_r[df_r['case_label'] == 'not_specified']\n_slice2 = _slice2.reset_index(drop=True)\nprint(len(_slice2))\n_slice2.head()\n\n#%%\n\n_slice2['case_label'] = list(_slice2.loc[:, ('ruling_content')])\n_slice2['ruling_content'] = list(_slice2.loc[:, ('ruling_type')])\n\n#%%\n\ndf_r = df_r.drop(df_r[df_r['source'].isin(list(_slice2['source']))].index)\ndf_r = 
df_r.append(_slice2).reset_index(drop=True)\n\n#%%\n\ndf_r[df_r['case_label'] == 'not_specified']\n\n#%%\n\n# they are in the raw, althought just name\n#df.loc[(df['source'] == '62008CJ0202')|(df['source'] == '62008CO0561')]\n\n#%%\n\n# correct cases\ndf_r.loc[df_r['source'] == '62008CJ0202', 'ruling_name'] = list(\n df.loc[df['source'] == '62008CJ0202', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62008CO0561', 'ruling_name'] = list(\n df.loc[df['source'] == '62008CO0561', 'ruling_name'])[0]\n\n#%%", "target_code": "df_r = df_r.drop(['case_label'], axis=1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ---\n\n# # Data Cleaning\n\n# First of all we should get the metadata dataset with unique identifiers, we shall relate the main subject and countries on separate tables\n#\n# ---\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.style.use('ggplot')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Merging judgements and orders\n\n\nget_ipython().system('ls -la ../data/*')\n\n\ndf_o = pd.read_csv('../data/orders_metadata.csv', encoding='Latin-1')\ndf_j = pd.read_csv('../data/judgements_metadata.csv',\n encoding='Latin-1') # utf8 expected characters\ndf_j = df_j[list(df_o.columns.values)] # order the columns\ndf_j['case_type'] = 'Judgement'\ndf_o['case_type'] = 'Order'\nraw = df_j.append(df_o, ignore_index=True)\n\n\nprint(len(raw), len(raw['source'].unique()))\n\n\n# Since the lenght of the dataset isn't the same as the unique identifiers means that we have to deal with the main_subject\n\n# Write the main_subjects file\n\n\n#df[['source','main_subject']].to_csv('../data/main_subjects.csv', index=False)\n\n\n# Grouping the unique cases by celex number and re-merge them with the original set\n\n\ndef get_grouped_df(df, identifier, groupedby):\n subjects = df.groupby(identifier).count()[groupedby]\n df.index = df[identifier]\n df = df.loc[:, df.columns != groupedby]\n df = df[~df.index.duplicated(keep='first')]\n assert len(subjects) == len(df)\n df = df.join(subjects, how='left').reset_index(drop=True)\n return df\n\n\ndf = get_grouped_df(raw, 'source', 'main_subject')\n\n\nprint(len(df))\ndf.head()\n\n\n# test of uniqueness\n\n\ndf[df['source'] == '61976CJ0003'] # the case has 4 main subjects\n\n\ndef missing_values_table(df):\n mis_val = df.isnull().sum()\n mis_val_percent = 100 * df.isnull().sum()/len(df)\n mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n mis_val_table = mis_val_table.rename(\n columns={0: 'Missing Values', 1: '% of Total Values'})\n return mis_val_table\n\n\nmissing = missing_values_table(df)\n# showing only all those are not zero\nmissing = missing[missing['Missing Values'] > 0]\nprint(missing)\n\n\nplt.figure(figsize=(10, 4))\nplt.barh(range(len(missing)), missing['% of Total Values'],\n align='center', color='#003399', alpha=0.5, label='not_specified')\nplt.yticks(range(len(missing)), missing.index)\nplt.title('% Missing Values')\nplt.show()\n\n\ndef convert_nan(df, columns):\n for column in columns:\n df[str(column)] = ['not_specified' if str(i) ==\n 'nan' else i for i in df[str(column)]]\n return df\n\n\ndf = convert_nan(df, ['judge', 'advocate', 'chamber', 'country'])\n\n\n# change all the nulls for not specified except date\nprint(missing_values_table(df)[missing_values_table(df)['Missing Values'] > 0])\n\n\n# ### String to string columns\n\n# At this point everything except dates are strings, the NaN are float and it's difficult to handle it\n\n\ndef to_str(df):\n columns = [i for i in df.columns.values]\n 
for column in columns:\n df[str(column)] = df[str(column)].apply(lambda x: str(x))\n return df\n\n\ndf = to_str(df)\n\n\n# test\n#columns = [i for i in df.columns.values]\n# for c in columns:\n# print(set([type(i) for i in df[str(c)]]))\n\n\n# ### Handling dates and time\n\n\ndf['lodge_date'] = pd.to_datetime(df['lodge_date'], format='%d/%m/%Y')\ndf['document_date'] = pd.to_datetime(df['document_date'], format='%d/%m/%Y')\n\n\ndf['year_document'] = pd.DatetimeIndex(df['document_date']).year\ndf['month_document'] = pd.DatetimeIndex(df['document_date']).month\ndf['year_lodge'] = pd.DatetimeIndex(df['lodge_date']).year\ndf['month_lodge'] = pd.DatetimeIndex(df['lodge_date']).month\n\n\n# Cases with no date\n\n\ndf[df['lodge_date'].isnull()]\n\n\n# For this case we gop to eulex=62014CJ0049, to paste 23/01/2014 manually\n\n\ndf.loc[df['source'] == '62014CJ0049', 'lodge_date'] = pd.to_datetime(\n '23/01/2014', format='%d/%m/%Y')\n\n\ndf[df['document_date'].isnull()]\n\n\n# Creating a new feature, the size in time of the case\n\n\ndf['case_time'] = df['document_date'] - df['lodge_date']\n\n\ndf['case_time'].head(3) # we expect them to be all positives\n\n\n# Check out the exceptions\n\n\ndf[df['case_time'] < pd.Timedelta('0 days')]\n\n\n# There's one case, and in eulex=61969CJ0074, to paste 1969-12-04 manually\n\n\ndf.loc[df['source'] == '61969CJ0074', 'lodge_date'] = pd.to_datetime(\n '04/12/1969', format='%d/%m/%Y')\n\n\n# re-run case_time since we add the correct date\ndf['case_time'] = df['document_date'] - df['lodge_date']\n\n\ndf[df['case_time'] < pd.Timedelta('0 days')]\n\n\n# ### Number of countries and corrections\n\n# Create a column counting the number of countries involved in the case\n\n\ndf['n_countries'] = df['country'].apply(\n lambda x: len(x.split(';')) if x != 'nan' else 0)\n\n\n# test\ndf[df['source'] == '62000CJ0204'] # we expect 4 countries\n\n\n# We need a manual check to see what are the wrong cases in all those where there is only one country involved\n\n\ndf[df['n_countries'] == 1]['country'].sort_values().unique()\n\n\n# It looks like there's one advocate in the country column, fortunately it's the only case, and we re-arange them manually\n\n\ndf[df['country'] == 'La Pergola'][['country', 'judge', 'advocate']]\n\n\ndf.loc[df['country'] == 'La Pergola', 'judge'] = 'La Pergola'\ndf.loc[df['country'] == 'La Pergola', 'advocate'] = 'Jacobs'\n\n\n# Incorrect cases\ncases = {\n ('Provisional data', 'not_specified'),\n ('NLD', 'Netherlands'),\n ('La Pergola', 'not_specified'),\n ('GBR', 'United Kingdom'),\n ('FRA', 'France'),\n ('FIN', 'Finland'),\n ('DEU', 'Germany'),\n ('BEL', 'Belgium'),\n ('XX', 'not_specified'),\n ('USA', 'United States')\n}\n\n\ndef find_replace(l, cases):\n for a, b in cases:\n l = [row.replace(a, b) for row in l]\n return l\n\n\ndf['country'] = find_replace(df['country'], cases)\n\n\ndf[df['country'] == 'FRA'] # test country\n\n\n# ---\n\n# ### Countries Table\n\n# Now we need to create an independent dataset which enlist and relates the cases with the unique countries\n\n\ndef create_countries(celex, country, n_c):\n # vector of countries raw sorted by cases\n country_vec = [row.split(';') for row in country]\n country_vec = [item for sublist in country_vec for item in sublist]\n country_vec = [i if i[0] != ' ' else i[1:] for i in country_vec]\n # vector of celex ids raw sorted by cases\n celex_vec = []\n for idx, n in enumerate(n_c):\n if n == 0:\n celex_vec.append([celex[idx]])\n else:\n celex_vec.append([celex[idx] for k in range(n)])\n celex_vec = 
[item for sublist in celex_vec for item in sublist]\n assert len(celex_vec) == len(country_vec), 'Not equal lenght'\n return pd.DataFrame({'source': celex_vec, 'country': country_vec})\n\n\ndf_countries = create_countries(df['source'], df['country'], df['n_countries'])\n\n\ndf_countries.head()\n\n\n#df_countries.to_csv('../data/countries.csv', index=False)\n\n\n# All the wrong cases are replaced\n\n# ### Optimizing labels\n\n\ndf['joined_cases'] = [1 if i.find(\n 'oin') != -1 else 0 for i in df['case_label']]\n\n\ndf['ecli'] = [i[22:] for i in df['ecli']]\n\n\n# ---\n# ### Chamber country\n\n\ndf['chamber'].unique()\n\n\n# After check in eulex manually\n\n\n# Incorrect cases\ncases2 = {\n ('512032', 'Sixth Chamber'),\n ('512031', 'Sixth Chamber'),\n ('sixi\u00e8me chambre', 'Sixth chamber'),\n ('sixi\u00c3\u00a8me chambre', 'Sixth chamber'),\n ('as amended by Order of 10 July 1975', 'First chamber')\n}\n\n\ndf['chamber'] = find_replace(df['chamber'], cases2)\n\n\ndf[df['chamber'] == '512032'] # test chamber\n\n\ndf['country-chamber'] = df['country']+'-'+df['chamber']\n\n\n#sorted(df[df['n_countries'] == 1]['country-chamber'].unique())\n\n\nyears = sorted(df['year_document'].unique())\nnan_chamber = []\nfor year in years:\n group = df[df['year_document'] == year].groupby('chamber').count()\n nan_chamber.append(group.loc['not_specified']\n ['source']/group['source'].sum()*100)\n\n\nplt.figure(figsize=(18, 6))\nposition = range(len(years))\nplt.bar(position, nan_chamber, align='center',\n alpha=0.8, label='not_specified')\nplt.xticks(position, years, rotation=90)\nplt.legend(loc='best')\nplt.title('% of Null Chamber over the years')\nplt.show()\n\n\n# ---\n\n# ## Validation of columns in rulings\n\n# This step is additional, since the columns related with the rulings where mixed up, what we wnat at the end of the day it's to merge the two parts of the case\n\n# Applying the same methodology\n\n\ndf_or = pd.read_csv('../data/orders_ruling.csv', encoding='utf-8')\ndf_jr = pd.read_csv('../data/judgements_ruling.csv', encoding='utf-8')\ndf_jr = df_jr[list(df_or.columns.values)] # order the columns\nraw_r = df_jr.append(df_or, ignore_index=True)\nraw_r['c8'] = raw_r['c1']\n\n\nprint(len(raw_r), len(raw_r['c1'].unique()))\n\n\ndf_r = get_grouped_df(raw_r, 'c1', 'c8')\n\n\n# The columns ruling_title are correct and naturally the sources (c1,c2)\n\n\ndf_r = df_r[['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7']]\ndf_r.columns = ['source', 'ruling_title', 'ruling_name',\n 'ruling_type', 'ruling_content', 'case_label', 'extra']\n\n\nprint(len(df_r))\ndf_r.head()\n\n\ndf_r = convert_nan(df_r, ['ruling_title', 'ruling_type',\n 'ruling_content', 'case_label', 'extra'])\n\n\n# We follow this process to check not specified values in the columns and the and the swaping of places\n# - From the last column to the initial\n# - Check nulls of the column\n# - Check if there's elements of the 'column-1' in column\n# - Check if there's elements of the 'column-2' in column\n# - Repeat for all the columns till ends\n\n# ---\n# ### Extra column check\n\n# - Null values in other column\n\n\ndf_r[df_r['extra'] != 'not_specified']\n\n\n# correct 62011CJ0363\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_name'] = list(\n df.loc[df['source'] == '62011CJ0363', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_type'] = list(\n df.loc[df['source'] == '62011CJ0363', 'ruling_type'])[0]\ndf_r.loc[df_r['source'] == '62011CJ0363', 'ruling_content'] = list(\n df.loc[df['source'] == '62011CJ0363', 
'ruling_content'])[0]\n\n\ndf_r.loc[df_r['source'] == '62011CJ0363', [\n 'ruling_name', 'ruling_type', 'ruling_content']]\n\n\n# correct 62011CJ0363\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_name'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_type'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_type'])[0]\ndf_r.loc[df_r['source'] == '62015CO0462', 'ruling_content'] = list(\n df.loc[df['source'] == '62015CO0462', 'ruling_content'])[0]\n\n\ndf_r.loc[df_r['source'] == '62015CO0462', [\n 'ruling_name', 'ruling_type', 'ruling_content']]\n\n\ndf_r = df_r.drop(['extra'], axis=1)\n\n\n# ---\n# ### Case label column check\n\n# - Null values in case_label column\n\n\n_slice = df_r[df_r['case_label'] == 'not_specified']\n\n\n# we have to move one column to the left\n\n\n_slice = _slice.reset_index(drop=True)\nprint(len(_slice))\n_slice.head()\n\n\n_slice['case_label'] = list(_slice.loc[:, ('ruling_content')])\n_slice['ruling_content'] = list(_slice.loc[:, ('ruling_type')])\n_slice['ruling_type'] = list(_slice.loc[:, ('ruling_name')])\n\n\n# We take them out from the original code, then we remerge them\n\n\ndf_r = df_r.drop(df_r[df_r['source'].isin(list(_slice['source']))].index)\ndf_r = df_r.append(_slice).reset_index(drop=True)\n\n\n# - We have to repeat the moving to the left\n\n\n_slice2 = df_r[df_r['case_label'] == 'not_specified']\n_slice2 = _slice2.reset_index(drop=True)\nprint(len(_slice2))\n_slice2.head()\n\n\n_slice2['case_label'] = list(_slice2.loc[:, ('ruling_content')])\n_slice2['ruling_content'] = list(_slice2.loc[:, ('ruling_type')])\n\n\ndf_r = df_r.drop(df_r[df_r['source'].isin(list(_slice2['source']))].index)\ndf_r = df_r.append(_slice2).reset_index(drop=True)\n\n\ndf_r[df_r['case_label'] == 'not_specified']\n\n\n# they are in the raw, althought just name\n#df.loc[(df['source'] == '62008CJ0202')|(df['source'] == '62008CO0561')]\n\n\n# correct cases\ndf_r.loc[df_r['source'] == '62008CJ0202', 'ruling_name'] = list(\n df.loc[df['source'] == '62008CJ0202', 'ruling_name'])[0]\ndf_r.loc[df_r['source'] == '62008CO0561', 'ruling_name'] = list(\n df.loc[df['source'] == '62008CO0561', 'ruling_name'])[0]\n\n", "project_metadata": {"full_name": "MaastrichtU-IDS/case-law-analysis", "description": "Applied Legal Analytics on European Union Cases", "topics": ["applied-legal-analytics", "web-scraping"], "git_url": "git://github.com/MaastrichtU-IDS/case-law-analysis.git", "stars": 5, "watchers": 5, "forks": 2, "created": "2018-04-13T15:38:47Z", "size": 135989, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3476360, "Python": 8559, "C++": 6876}, "last_updated": "2020-10-05T21:22:03Z"}, "intent": "# drop the case_label column"}, {"original_comment": "# Take 10,000 samples out of the binomial distribution: n_defaults\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## **Statistical Thinking in Python (Part 1)**\n#\n# **Course Description**\n#\n# After all of the hard work of acquiring data and getting them into a form you can work with, you ultimately want to make clear, succinct conclusions from them. This crucial last step of a data analysis pipeline hinges on the principles of statistical inference. In this course, you will start building the foundation you need to think statistically, to speak the language of your data, to understand what they are telling you. 
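# A hedged sketch of the sampling step named in the next record's comment
# ("Take 10,000 samples out of the binomial distribution: n_defaults"). The number
# of trials n and the success probability p below are placeholders for whatever the
# notebook actually defines; only the sample count of 10,000 comes from the comment.

import numpy as np

np.random.seed(42)  # seeding is an added assumption, purely for reproducibility

n_defaults = np.random.binomial(n=100, p=0.05, size=10000)  # n and p are illustrative
print(n_defaults[:10])
print(n_defaults.mean())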
The foundations of statistical thinking took decades upon decades to build, but they can be grasped much faster today with the help of computers. With the power of Python-based tools, you will rapidly get up to speed and begin thinking statistically by the end of this course.\n\n# **Imports**\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.patches import Rectangle\nimport numpy as np\nfrom pprint import pprint as pp\nimport csv\nfrom pathlib import Path\nimport seaborn as sns\nfrom scipy.stats import binom\n\nfrom sklearn.datasets import load_iris\n\n\n# **Pandas Configuration Options**\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# **Data Files Location**\n#\n# * Most data files for the exercises can be found on the [course site](https://www.datacamp.com/courses/statistical-thinking-in-python-part-1)\n# * [2008 election results (all states)](https://assets.datacamp.com/production/repositories/469/datasets/8fb59b9a99957c3b9b1c82b623aea54d8ccbcd9f/2008_all_states.csv)\n# * [2008 election results (swing states)](https://assets.datacamp.com/production/repositories/469/datasets/e079fddb581197780e1a7b7af2aeeff7242535f0/2008_swing_states.csv)\n# * [Belmont Stakes](https://assets.datacamp.com/production/repositories/469/datasets/7507bfed990379f246b4f166ea8a57ecf31c6c9d/belmont.csv)\n# * [Speed of light](https://assets.datacamp.com/production/repositories/469/datasets/df23780d215774ff90be0ea93e53f4fb5ebbade8/michelson_speed_of_light.csv)\n\n# **Data File Objects**\n\n#%%\n\ndata = Path.cwd() / 'data' / 'statistical_thinking_1'\nelections_all_file = data / '2008_all_states.csv'\nelections_swing_file = data / '2008_swing_states.csv'\nbelmont_file = data / 'belmont.csv'\nsol_file = data / 'michelson_speed_of_light.csv'\n\n\n# **Iris Data Set**\n\n#%%\n\niris = load_iris()\niris_df = pd.DataFrame(data=np.c_[\n iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])\n\n\ndef iris_typing(x):\n types = {0.0: 'setosa',\n 1.0: 'versicolour',\n 2.0: 'virginica'}\n return types[x]\n\n\niris_df['species'] = iris_df.target.apply(iris_typing)\niris_df.head()\n\n\n# # Graphical exploratory data analysis\n#\n# Look before you leap! A very important proverb, indeed. Prior to diving in headlong into sophisticated statistical inference techniques, you should first explore your data by plotting them and computing simple summary statistics. This process, called exploratory data analysis, is a crucial first step in statistical analysis of data. 
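# An optional variant of the species-labelling step above: pandas' Series.map with a
# dict produces the same column as applying the iris_typing() helper row by row.
# This is only a sketch that rebuilds the same DataFrame from sklearn's iris data.

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
iris_df = pd.DataFrame(np.c_[iris['data'], iris['target']],
                       columns=iris['feature_names'] + ['target'])

species_map = {0.0: 'setosa', 1.0: 'versicolour', 2.0: 'virginica'}
iris_df['species'] = iris_df['target'].map(species_map)
print(iris_df.head())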
So it is a fitting subject for the first chapter of Statistical Thinking in Python.\n\n# ## Introduction to exploratory data analysis\n#\n# * Exploring the data is a crucial step of the analysis.\n# * Organizing\n# * Plotting\n# * Computing numerical summaries\n# * This idea is known as exploratory data analysis (EDA)\n# * \"Exploratory data analysis can never be the whole story, but nothing else can serve as the foundation stone.\" - [John Tukey](https://en.wikipedia.org/wiki/John_Tukey)\n\n#%%\n\nswing = pd.read_csv(elections_swing_file)\nswing.head()\n\n\n# * The raw data isn't particularly informative\n# * We could start computing parameters and their confidence intervals and do hypothesis test...\n# * ...however, we should graphically explore the data first\n\n# ### Tukey's comments on EDA\n#\n# Even though you probably have not read Tukey's book, I suspect you already have a good idea about his viewpoint from the video introducing you to exploratory data analysis. Which of the following quotes is not directly from Tukey?\n#\n# 1. Exploratory data analysis is detective work.\n# 1. There is no excuse for failing to plot and look.\n# 1. The greatest value of a picture is that it forces us to notice what we never expected to see.\n# 1. It is important to understand what you can do before you learn how to measure how well you seem to have done it.\n# 1. ~~**Often times EDA is too time consuming, so it is better to jump right in and do your hypothesis tests.**~~\n\n# ### Advantages of graphical EDA\n#\n# Which of the following is not true of graphical EDA?\n#\n# 1. It often involves converting tabular data into graphical form.\n# 1. If done well, graphical representations can allow for more rapid interpretation of data.\n# 1. ~~**A nice looking plot is always the end goal of a statistical analysis.**~~\n# 1. There is no excuse for neglecting to do graphical EDA.\n\n# ## Plotting a histogram\n#\n# * always label the axes\n\n#%%\n\nbin_edges = [x for x in range(0, 110, 10)]\nplt.hist(x=swing.dem_share, bins=bin_edges, edgecolor='black')\nplt.xticks(bin_edges)\nplt.yticks(bin_edges[:-1])\nplt.xlabel('Percent of vote for Obama')\nplt.ylabel('Number of Counties')\nplt.show()\n\n\n# **Seaborn**\n\n#%%\n\nsns.set()\n\n#%%\n\nplt.hist(x=swing.dem_share)\nplt.xlabel('Percent of vote for Obama')\nplt.ylabel('Number of Counties')\nplt.show()\n\n\n# ### Plotting a histogram of iris data\n#\n# For the exercises in this section, you will use a classic data set collected by botanist Edward Anderson and made famous by Ronald Fisher, one of the most prolific statisticians in history. Anderson carefully measured the anatomical properties of samples of three different species of iris, *Iris setosa*, *Iris versicolor*, and *Iris virginica*. The full data set is [available as part of scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html). Here, you will work with his measurements of petal length.\n#\n# Plot a histogram of the petal lengths of his 50 samples of Iris versicolor using matplotlib/seaborn's default settings. Recall that to specify the default seaborn style, you can use `sns.set()`, where `sns` is the alias that `seaborn` is imported as.\n#\n# The subset of the data set containing the Iris versicolor petal lengths in units of centimeters (cm) is stored in the NumPy array `versicolor_petal_length`.\n#\n# In the video, Justin plotted the histograms by using the `pandas` library and indexing the DataFrame to extract the desired column. 
Here, however, you only need to use the provided NumPy array. Also, Justin assigned his plotting statements (except for `plt.show()`) to the dummy variable `_`. This is to prevent unnecessary output from being displayed. It is not required for your solutions to these exercises, however it is good practice to use it. Alternatively, if you are working in an interactive environment such as a Jupyter notebook, you could use a `;` after your plotting statements to achieve the same effect. Justin prefers using `_`. Therefore, you will see it used in the solution code.\n#\n# **Instructions**\n#\n# * Import `matplotlib.pyplot` and `seaborn` as their usual aliases (`plt` and `sns`).\n# * Use `seaborn` to set the plotting defaults.\n# * Plot a histogram of the Iris versicolor petal lengths using `plt.hist()` and the provided NumPy array `versicolor_petal_length`.\n# * Show the histogram using `plt.show()`.\n\n#%%\n\nversicolor_petal_length = iris_df['petal length (cm)'][iris_df.species ==\n 'versicolour']\n\n#%%\n\nplt.hist(versicolor_petal_length)\nplt.show()\n\n\n# ### Axis labels!\n#\n# In the last exercise, you made a nice histogram of petal lengths of Iris versicolor, but **you didn't label the axes!** That's ok; it's not your fault since we didn't ask you to. Now, add axis labels to the plot using `plt.xlabel()` and `plt.ylabel()`. Don't forget to add units and assign both statements to `_`. The packages `matplotlib.pyplot` and `seaborn` are already imported with their standard aliases. This will be the case in what follows, unless specified otherwise.\n#\n# **Instructions**\n#\n# * Label the axes. Don't forget that you should always include units in your axis labels. Your y-axis label is just `'count'`. Your x-axis label is `'petal length (cm)'`. The units are essential!\n# * Display the plot constructed in the above steps using `plt.show()`.\n\n#%%\n\nplt.hist(versicolor_petal_length)\nplt.xlabel('petal length (cm)')\nplt.ylabel('count')\nplt.show()\n\n\n# ### Adjusting the number of bins in a histogram\n#\n# The histogram you just made had ten bins. This is the default of matplotlib. The \"square root rule\" is a commonly-used rule of thumb for choosing number of bins: choose the number of bins to be the square root of the number of samples. Plot the histogram of Iris versicolor petal lengths again, this time using the square root rule for the number of bins. You specify the number of bins using the `bins` keyword argument of `plt.hist()`.\n#\n# The plotting utilities are already imported and the seaborn defaults already set. The variable you defined in the last exercise, `versicolor_petal_length`, is already in your namespace.\n#\n# **Instructions**\n#\n# * Import `numpy` as `np`. 
This gives access to the square root function, `np.sqrt()`.\n# * Determine how many data points you have using `len()`.\n# * Compute the number of bins using the square root rule.\n# * Convert the number of bins to an integer using the built in `int()` function.\n# * Generate the histogram and make sure to use the `bins` keyword argument.\n# * Hit 'Submit Answer' to plot the figure and see the fruit of your labors!\n\n#%%\n\n# Compute number of data points: n_data\nn_data = len(versicolor_petal_length)\n\n# Number of bins is the square root of number of data points: n_bins\nn_bins = np.sqrt(n_data)\n\n# Convert number of bins to integer: n_bins\nn_bins = int(n_bins)\n\n# Plot the histogram\n_ = plt.hist(versicolor_petal_length, bins=n_bins)\n\n# Label axes\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('count')\n\n# Show histogram\nplt.show()\n\n\n# ## Plotting all of your data: Bee swarm plots\n#\n# * Binning Bias: The same data may be interpreted differently depending on choice of bins\n# * Additionally, all of the data isn't being plotted; the precision of the actual data is lost in the bins\n# * These issues can be resolved with swarm plots\n# * Point position along the y-axis is the quantitative information\n# * The data are spread in x to make them visible, but their precise location along the x-axis is unimportant\n# * No binning bias and all the data are displayed.\n# * Seaborn & Pandas\n\n#%%\n\nsns.swarmplot(x='state', y='dem_share', data=swing)\nplt.xlabel('state')\nplt.ylabel('percent of vote for Obama')\nplt.title('% of Vote per Swing State County')\nplt.show()\n\n\n# ### Bee swarm plot\n#\n# Make a bee swarm plot of the iris petal lengths. Your x-axis should contain each of the three species, and the y-axis the petal lengths. A data frame containing the data is in your namespace as `df`.\n#\n# For your reference, the code Justin used to create the bee swarm plot in the video is provided below:\n#\n# ```python\n# _ = sns.swarmplot(x='state', y='dem_share', data=df_swing)\n# _ = plt.xlabel('state')\n# _ = plt.ylabel('percent of vote for Obama')\n# plt.show()\n# ```\n#\n# In the IPython Shell, you can use `sns.swarmplot`? or `help(sns.swarmplot)` for more details on how to make bee swarm plots using seaborn.\n#\n# **Instructions**\n#\n# * In the IPython Shell, inspect the DataFrame `df` using `df.head()`. This will let you identify which column names you need to pass as the `x` and `y` keyword arguments in your call to `sns.swarmplot()`.\n# * Use `sns.swarmplot()` to make a bee swarm plot from the DataFrame containing the Fisher iris data set, `df`. The x-axis should contain each of the three species, and the y-axis should contain the petal lengths.\n# * Label the axes.\n# * Show your plot.\n\n#%%\n\nsns.swarmplot(x='species', y='petal length (cm)', data=iris_df)\nplt.xlabel('species')\nplt.ylabel('petal length (cm)')\nplt.show()\n\n\n# ### Interpreting a bee swarm plot\n#\n# Which of the following conclusions could you draw from the bee swarm plot of iris petal lengths you generated in the previous exercise? For your convenience, the bee swarm plot is regenerated and shown to the right.\n#\n# **Instructions**\n#\n# Possible Answers\n# 1. All I. versicolor petals are shorter than I. virginica petals.\n# 1. I. setosa petals have a broader range of lengths than the other two species.\n# 1. __**I. virginica petals tend to be the longest, and I. setosa petals tend to be the shortest of the three species.**__\n# 1. I. versicolor is a hybrid of I. virginica and I. 
setosa.\n\n# ## Plotting all of your data: Empirical cumulative distribution functions\n#\n# * [Empirical Distribution Function](https://en.wikipedia.org/wiki/Empirical_distribution_function)\n# * [Empirical Distribution Function / Empirical CDF](https://www.statisticshowto.datasciencecentral.com/empirical-distribution-function/)\n# * An **empirical cumulative distribution function** (also called the empirical distribution function, **ECDF**, or just **EDF**) and a [cumulative distribution function](https://www.statisticshowto.datasciencecentral.com/cumulative-distribution-function/) are basically the same thing; they are both probability models for data. While ***a CDF is a hypothetical model of a distribution***, **the ECDF models empirical (i.e. observed) data**. To put this another way, __**the ECDF is the [probability distribution](https://www.statisticshowto.datasciencecentral.com/probability-distribution/) you would get if you sampled from your [sample](https://www.statisticshowto.datasciencecentral.com/sample/), instead of the [population](https://www.statisticshowto.datasciencecentral.com/what-is-a-population/)**__. Lets say you have a set of experimental (observed) data $x_{1},x_{2},\\,\\ldots\\,x_{n}$. The EDF will give you the fraction of sample observations less than or equal to a particular value of $x$.\n# * More formally, if you have a set of [order statistics](https://www.statisticshowto.datasciencecentral.com/order-statistics/) ($y_{1}\", color='b'))\n\nax.annotate('75% of counties had < 0.5 vote for Obama', xy=(50, .75),\n xytext=(55, 0.6), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='b'))\n\nplt.show()\n\n\n# #### plot multiple ECDFs\n\n#%%\n\nfig, ax = plt.subplots(figsize=(10, 5))\nax.margins(0.05) # Default margin is 0.05, value 0 means fit\n\nfor state in swing.state.unique():\n x = np.sort(swing['dem_share'][swing.state == state])\n y = np.arange(1, len(x)+1) / len(x)\n ax.plot(x, y, marker='.', linestyle='none', label=state)\n\nplt.xlabel('percent of vote for Obama')\nplt.ylabel('ECDF')\nplt.legend()\n\nplt.show()\n\n\n# ### Computing the ECDF\n#\n# In this exercise, you will write a function that takes as input a 1D array of data and then returns the `x` and `y` values of the ECDF. You will use this function over and over again throughout this course and its sequel. ECDFs are among the most important plots in statistical analysis. You can write your own function, `foo(x,y)` according to the following skeleton:\n#\n# ```python\n# def foo(a,b):\n# \"\"\"State what function does here\"\"\"\n# # Computation performed here\n# return x, y\n# ```\n#\n# The function `foo()` above takes two arguments `a` and `b` and returns two values `x` and `y`. The function header `def foo(a,b):` contains the function signature `foo(a,b)`, which consists of the function name, along with its parameters. For more on writing your own functions, see [DataCamp's course Python Data Science Toolbox (Part 1)](https://www.datacamp.com/courses/python-data-science-toolbox-part-1)!\n#\n# **Instructions**\n#\n# * Define a function with the signature `ecdf(data)`. Within the function definition,\n# * Compute the number of data points, `n`, using the `len()` function.\n# * The **x**-values are the sorted data. Use the `np.sort()` function to perform the sorting.\n# * The **y** data of the ECDF go from `1/n` to `1` in equally spaced increments. You can construct this using `np.arange()`. Remember, however, that the end value in `np.arange()` is not inclusive. 
Therefore, `np.arange()` will need to go from `1` to `n+1`. Be sure to divide this by `n`.\n# * The function returns the values `x` and `y`.\n\n# #### def ecdf()\n\n#%%\n\ndef ecdf(data):\n \"\"\"Compute ECDF for a one-dimensional array of measurements.\"\"\"\n # Number of data points: n\n n = len(data)\n\n # x-data for the ECDF: x\n x = np.sort(data)\n\n # y-data for the ECDF: y\n y = np.arange(1, n+1) / n\n\n return x, y\n\n\n# ### Plotting the ECDF\n#\n# You will now use your `ecdf()` function to compute the ECDF for the petal lengths of Anderson's *Iris versicolor* flowers. You will then plot the ECDF. Recall that your `ecdf()` function returns two arrays so you will need to unpack them. An example of such unpacking is `x, y = foo(data)`, for some function `foo()`.\n#\n# **Instructions**\n#\n# * Use `ecdf()` to compute the ECDF of `versicolor_petal_length`. Unpack the output into `x_vers` and `y_vers`.\n# * Plot the ECDF as dots. Remember to include `marker = '.'` and `linestyle = 'none'` in addition to `x_vers` and `y_vers` as arguments inside `plt.plot()`.\n# * Label the axes. You can label the y-axis `'ECDF'`.\n# * Show your plot.\n\n#%%\n\n# Compute ECDF for versicolor data: x_vers, y_vers\nx, y = ecdf(versicolor_petal_length)\n\n# Generate plot\nplt.plot(x, y, marker='.', linestyle='none')\n\n# Label the axes\nplt.xlabel('Versicolor Petal Length (cm)')\nplt.ylabel('ECDF')\n\n# Display the plot\nplt.margins(0.02) # keep data off plot edges\nplt.show()\n\n\n# ### Comparison of ECDFs\n#\n# ECDFs also allow you to compare two or more distributions (though plots get cluttered if you have too many). Here, you will plot ECDFs for the petal lengths of all three iris species. You already wrote a function to generate ECDFs so you can put it to good use!\n#\n# To overlay all three ECDFs on the same plot, you can use `plt.plot()` three times, once for each ECDF. Remember to include `marker='.'` and `linestyle='none'` as arguments inside `plt.plot()`.\n#\n# **Instructions**\n#\n# * Compute ECDFs for each of the three species using your `ecdf()` function. The variables `setosa_petal_length`, `versicolor_petal_length`, and `virginica_petal_length` are all in your namespace. Unpack the ECDFs into `x_set`, `y_set`, `x_vers`, `y_vers` and `x_virg`, `y_virg`, respectively.\n# * Plot all three ECDFs on the same plot as dots. To do this, you will need three `plt.plot()` commands. 
Assign the result of each to `_`.\n# * A legend and axis labels have been added for you, so hit 'Submit Answer' to see all the ECDFs!\n\n#%%\n\nvirginica_petal_length = iris_df['petal length (cm)'][iris_df.species ==\n 'virginica']\nsetosa_petal_length = iris_df['petal length (cm)'][iris_df.species == 'setosa']\n\n# Compute ECDFs\nx_set, y_set = ecdf(setosa_petal_length)\nx_vers, y_vers = ecdf(versicolor_petal_length)\nx_virg, y_virg = ecdf(virginica_petal_length)\n\n# Plot all ECDFs on the same plot\nplt.plot(x_set, y_set, marker='.', linestyle='none')\nplt.plot(x_vers, y_vers, marker='.', linestyle='none')\nplt.plot(x_virg, y_virg, marker='.', linestyle='none')\n\n# Annotate the plot\nplt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('ECDF')\n\n# Display the plot\nplt.show()\n\n\n# ## Onward toward the whole story\n#\n# * Start with graphical eda!\n#\n# **Coming up...**\n#\n# * Thinking probabilistically\n# * Discrete and continuous distributions\n# * The power of hacker statistics using np.random()\n\n# # Quantitative exploratory data analysis\n#\n# In the last chapter, you learned how to graphically explore data. In this chapter, you will compute useful summary statistics, which serve to concisely describe salient features of a data set with a few numbers.\n\n# ## Introduction to summary statistics: The sample mean and median\n#\n# * mean - average\n# * heavily influenced by outliers\n# * `np.mean()`\n# * median - middle value of the sorted dataset\n# * immune to outlier influence\n# * `np.median()`\n\n# ### Means and medians\n#\n# Which one of the following statements is true about means and medians?\n#\n# **Possible Answers**\n#\n# * ~~An outlier can significantly affect the value of both the mean and the median.~~\n# * **An outlier can significantly affect the value of the mean, but not the median.**\n# * ~~Means and medians are in general both robust to single outliers.~~\n# * ~~The mean and median are equal if there is an odd number of data points.~~\n\n# ### Computing means\n#\n# The mean of all measurements gives an indication of the typical magnitude of a measurement. It is computed using `np.mean()`.\n#\n# **Instructions**\n#\n# * Compute the mean petal length of Iris versicolor from Anderson's classic data set. The variable `versicolor_petal_length` is provided in your namespace. Assign the mean to `mean_length_vers`.\n\n#%%\n\n# Compute the mean: mean_length_vers\nmean_length_vers = np.mean(versicolor_petal_length)\n\n# Print the result with some nice formatting\nprint('I. 
versicolor:', mean_length_vers, 'cm')\n\n\n# #### with pandas.DataFrame\n\n#%%\n\niris_df.groupby(['species']).mean()\n\n\n# ## Percentiles, outliers and box plots\n#\n# * The median is a special name for the 50th percentile\n# * 50% of the data are less than the median\n# * The 25th percentile is the value of the data point that is greater than 25% of the sorted data\n# * percentiles are useful summary statistics and can be computed using `np.percentile()`\n#\n# **Computing Percentiles**\n#\n# ```python\n# np.percentile(df_swing['dem_share'], [25, 50, 75])\n# ```\n#\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/box_plot.JPG)\n#\n# * Box plots are a graphical methode for displying summary statistics\n# * median is the middle line: 50th percentile\n# * bottom and top line of the box represent the 25th & 75th percentile, repectively\n# * the space between the 25th and 75th percentile is the interquartile range (IQR)\n# * Whiskers extent a distance of 1.5 time the IQR, or the extent of the data, whichever is less extreme\n# * Any points outside the whiskers are plotted as individual points, which we demarcate as outliers\n# * There is no single definition for an outlier, however, being more than 2 IQRs away from the median is a common criterion.\n# * An outlier is not necessarily erroneous\n# * Box plots are a great alternative to bee swarm plots, becasue bee swarm plots become too cluttered with large data sets\n\n#%%\n\nall_states = pd.read_csv(elections_all_file)\nall_states.head()\n\n#%%\n\nsns.boxplot(x='east_west', y='dem_share', data=all_states)\nplt.xlabel('region')\nplt.ylabel('percent of vote for Obama')\nplt.show()\n\n\n# ### Computing percentiles\n#\n# In this exercise, you will compute the percentiles of petal length of *Iris versicolor*.\n#\n# **Instructions**\n#\n# * Create `percentiles`, a NumPy array of percentiles you want to compute. These are the 2.5th, 25th, 50th, 75th, and 97.5th. You can do so by creating a list containing these ints/floats and convert the list to a NumPy array using `np.array()`. For example, `np.array([30, 50])` would create an array consisting of the 30th and 50th percentiles.\n# * Use `np.percentile()` to compute the percentiles of the petal lengths from the Iris versicolor samples. The variable `versicolor_petal_length` is in your namespace.\n\n#%%\n\n# Specify array of percentiles: percentiles\npercentiles = np.array([2.5, 25, 50, 75, 97.5])\n\n# Compute percentiles: ptiles_vers\nptiles_vers = np.percentile(versicolor_petal_length, percentiles)\n\n# Print the result\nptiles_vers\n\n\n# ### Comparing percentiles to ECDF\n#\n# To see how the percentiles relate to the ECDF, you will plot the percentiles of Iris versicolor petal lengths you calculated in the last exercise on the ECDF plot you generated in chapter 1. The percentile variables from the previous exercise are available in the workspace as `ptiles_vers` and `percentiles`.\n#\n# Note that to ensure the Y-axis of the ECDF plot remains between 0 and 1, you will need to rescale the `percentiles` array accordingly - in this case, dividing it by 100.\n#\n# **Instructions**\n#\n# * Plot the percentiles as red diamonds on the ECDF. Pass the x and y co-ordinates - `ptiles_vers` and `percentiles/100` - as positional arguments and specify the `marker='D'`, `color='red'` and `linestyle='none'` keyword arguments. 
The argument for the y-axis - `percentiles/100` has been specified for you.\n\n#%%\n\n# Plot the ECDF\n_ = plt.plot(x_vers, y_vers, '.')\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('ECDF')\n\n# Overlay percentiles as red diamonds.\n_ = plt.plot(ptiles_vers, percentiles/100, marker='D',\n color='red', linestyle='none')\nplt.show()\n\n\n# ### Box-and-whisker plot\n#\n# Making a box plot for the petal lengths is unnecessary because the iris data set is not too large and the bee swarm plot works fine. However, it is always good to get some practice. Make a box plot of the iris petal lengths. You have a pandas DataFrame, `df`, which contains the petal length data, in your namespace. Inspect the data frame `df` in the IPython shell using `df.head()` to make sure you know what the pertinent columns are.\n#\n# For your reference, the code used to produce the box plot in the video is provided below:\n#\n# ```python\n# _ = sns.boxplot(x='east_west', y='dem_share', data=df_all_states)\n#\n# _ = plt.xlabel('region')\n#\n# _ = plt.ylabel('percent of vote for Obama')\n# ```\n#\n# In the IPython Shell, you can use `sns.boxplot?` or `help(sns.boxplot)` for more details on how to make box plots using seaborn.\n#\n# **Instructions**\n#\n# * The set-up is exactly the same as for the bee swarm plot; you just call `sns.boxplot()` with the same keyword arguments as you would `sns.swarmplot()`. The x-axis is `'species'` and y-axis is `'petal length (cm)'`.\n# * Don't forget to label your axes!\n\n#%%\n\nfig, ax = plt.subplots(figsize=(10, 7))\n# Create box plot with Seaborn's default settings\n_ = sns.boxplot(x='species', y='petal length (cm)', data=iris_df)\n\n# Label the axes\n_ = plt.ylabel('petal length (cm)')\n_ = plt.xlabel('species')\n\n# Show the plot\nplt.show()\n\n\n# ## Variance and standard deviation\n#\n# * measures of spread\n# * variance:\n# * The mean squared distance of the data from the mean\n# * $$variance = \\frac{1}{n}\\sum_{i=1}^{n}(x_{i} - \\overline{x})^2$$\n# * because of the squared quantity, variance doesn't have the same units as the measurement\n# * standard deviation:\n# * $$\\sqrt{variance}$$\n\n# #### Variance\n\n#%%\n\ndem_share_fl = all_states.dem_share[all_states.state == 'FL']\n\n#%%\n\nnp.var(dem_share_fl)\n\n#%%\n\nall_states_var = all_states[['state', 'total_votes', 'dem_votes',\n 'rep_votes', 'other_votes', 'dem_share']].groupby(['state']).var(ddof=0)\nall_states_var.dem_share.loc['FL']\n\n#%%\n\nall_states_var.head()\n\n\n# #### Standard Deviation\n\n#%%\n\nnp.std(dem_share_fl)\n\n#%%\n\nnp.sqrt(np.var(dem_share_fl))\n\n#%%\n\nall_states_std = all_states[['state', 'total_votes', 'dem_votes',\n 'rep_votes', 'other_votes', 'dem_share']].groupby(['state']).std(ddof=0)\nall_states_std.dem_share.loc['FL']\n\n#%%\n\nall_states_std.head()\n\n\n# ### Computing the variance\n#\n# It is important to have some understanding of what commonly-used functions are doing under the hood. Though you may already know how to compute variances, this is a beginner course that does not assume so. In this exercise, we will explicitly compute the variance of the petal length of Iris veriscolor using the equations discussed in the videos. We will then use `np.var()` to compute it.\n#\n# **Instructions**\n#\n# * Create an array called differences that is the `difference` between the petal lengths (`versicolor_petal_length`) and the mean petal length. 
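# #### Aside: why `ddof=0`? (optional)
#
# A brief hedged note, not from the course material itself: `np.var()` and
# `np.std()` default to the population formulas (`ddof=0`), while the pandas
# `.var()`/`.std()` methods default to the sample formulas (`ddof=1`). Passing
# `ddof=0` to the groupby aggregations above is what makes them agree with the
# NumPy results. `dem_share_fl` is assumed to still be defined.

#%%

print(np.var(dem_share_fl))        # NumPy default: ddof=0 (population variance)
print(dem_share_fl.var())          # pandas default: ddof=1 (sample variance)
print(dem_share_fl.var(ddof=0))    # matches np.var()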
The variable `versicolor_petal_length` is already in your namespace as a NumPy array so you can take advantage of NumPy's vectorized operations.\n# * Square each element in this array. For example, `x**2` squares each element in the array `x`. Store the result as `diff_sq`.\n# * Compute the mean of the elements in `diff_sq` using `np.mean()`. Store the result as `variance_explicit`.\n# * Compute the variance of `versicolor_petal_length` using `np.var()`. Store the result as `variance_np`.\n# * Print both `variance_explicit` and `variance_np` in one `print` call to make sure they are consistent.\n\n#%%\n\n# Array of differences to mean: differences\ndifferences = versicolor_petal_length - np.mean(versicolor_petal_length)\n\n# Square the differences: diff_sq\ndiff_sq = differences**2\n\n# Compute the mean square difference: variance_explicit\nvariance_explicit = np.mean(diff_sq)\n\n# Compute the variance using NumPy: variance_np\nvariance_np = np.var(versicolor_petal_length)\n\n# Print the results\nprint(variance_explicit, variance_np)\n\n\n# ### The standard deviation and the variance\n#\n# As mentioned in the video, the standard deviation is the square root of the variance. You will see this for yourself by computing the standard deviation using `np.std()` and comparing it to what you get by computing the variance with `np.var()` and then computing the square root.\n#\n# **Instructions**\n#\n# * Compute the variance of the data in the `versicolor_petal_length` array using `np.var()` and store it in a variable called `variance`.\n# * Print the square root of this value.\n# * Print the standard deviation of the data in the `versicolor_petal_length` array using `np.std()`\n\n#%%\n\n# Compute the variance: variance\nvariance = np.var(versicolor_petal_length)\n\n# Print the square root of the variance\nstd_explicit = np.sqrt(variance)\n\n# Print the standard deviation\nstd_np = np.std(versicolor_petal_length)\n\nprint(std_explicit, std_np)\n\n\n# ## Covariance and Pearson correlation coefficient\n#\n# ![covariance](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/covariance.JPG)\n#\n# * Covariance\n# * $$covariance = \\frac{1}{n}\\sum_{i=1}^{n}(x_{i} - \\overline{x})(y_{i} - \\overline{y})$$\n# * The data point differs from the mean vote share and the mean total votes for Obama\n# * The differences for each data point can be computed\n# * The covariance is the mean of the product of these differences\n# * If both x and y tend to be above or below their respective means together, as they are in this data set, the covariance is positive.\n# * This means they are positively correlated:\n# * When x is high, so is y\n# * When the county is populous, it has more votes for Obama\n# * If x is high while y is low, the covariance is negative\n# * This means they are negatively correlated (anticorrelated) - not the case for this data set.\n#\n# ![pearson](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/pearson.JPG)\n#\n# * Pearson correlation\n# * A more generally applicable measure of how two variables depend on each other, should be dimensionless (not units).\n# * $$\\rho = Pearson\\space correlation = \\frac{covariance}{(std\\space of\\space x)(std\\space of\\space y)}$$\n# * $$\\rho = \\frac{variability\\space due\\space to\\space codependence}{independent\\space variability}$$\n# * Comparison of the variability in the data due to codependence (the covariance) to the variability inherent to each variable 
independently (their standard deviations).\n# * It's dimensionless and ranges from -1 (for complete anticorrelation) to 1 (for complete correlation).\n# * A value of zero means there is no correlation between the data, as shown in the upper left plot.\n# * Good metric for correlation between two variables.\n\n#%%\n\nplt.figure(figsize=(10, 8))\nsns.scatterplot(x='total_votes', y='dem_share', data=swing, hue='state')\nplt.xlabel('total votes')\nplt.ylabel('% of vote for Obama')\nplt.xticks([x for x in range(0, 1000000, 100000)], rotation=40)\nplt.yticks([x for x in range(0, 100, 10)])\n\n# Create a Rectangle patch\nplt.gca().add_patch(Rectangle((400000, 52), 500000, 34,\n linewidth=1, edgecolor='b', facecolor='none'))\n\nplt.gca().add_patch(Rectangle((0, 5), 50000, 45,\n linewidth=1, edgecolor='r', facecolor='none'))\n\n# Annotate\nplt.annotate('12 largest counties; most vote for Obama', xy=(650000, 52), weight='bold',\n xytext=(400000, 35), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='b'))\n\nplt.annotate('small counties; most vote for McCain', xy=(50000, 20), weight='bold',\n xytext=(150000, 7), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='r'))\n\nplt.show()\n\n\n# ### Scatter plots\n#\n# When you made bee swarm plots, box plots, and ECDF plots in previous exercises, you compared the petal lengths of different species of iris. But what if you want to compare two properties of a single species? This is exactly what we will do in this exercise. We will make a **scatter plot** of the petal length and width measurements of Anderson's Iris versicolor flowers. If the flower scales (that is, it preserves its proportion as it grows), we would expect the length and width to be correlated.\n#\n# For your reference, the code used to produce the scatter plot in the video is provided below:\n#\n# ```python\n# _ = plt.plot(total_votes/1000, dem_share, marker='.', linestyle='none')\n# _ = plt.xlabel('total votes (thousands)')\n# _ = plt.ylabel('percent of vote for Obama')\n# ```\n#\n# **Instructions**\n#\n# * Use `plt.plot()` with the appropriate keyword arguments to make a scatter plot of versicolor petal length (x-axis) versus petal width (y-axis). The variables `versicolor_petal_length` and `versicolor_petal_width` are already in your namespace. Do not forget to use the `marker='.'` and `linestyle='none'` keyword arguments.\n# * Label the axes.\n# * Display the plot.\n\n#%%\n\nversicolor_petal_width = iris_df['petal width (cm)'][iris_df.species ==\n 'versicolour']\n\n# Make a scatter plot\n_ = plt.plot(versicolor_petal_length, versicolor_petal_width,\n marker='.', linestyle='none')\n\n# Label the axes\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('petal width (cm)')\n\n# Show the result\nplt.show()\n\n\n# ### Variance and covariance by looking\n#\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/var_covar_by_looking.JPG)\n#\n# Consider four scatter plots of x-y data, appearing to the right. Which has, respectively,\n#\n# * the highest variance in the variable x,\n# * the highest covariance,\n# * negative covariance?\n#\n# **Instructions**\n#\n# Possible Answers\n# * ~~a, c, b~~\n# * ~~d, c, a~~\n# * __**d, c, b**__\n# * ~~d, d, b~~\n\n# ### Computing the covariance\n#\n# The covariance may be computed using the Numpy function `np.cov()`. For example, we have two sets of data `x` and `y`, `np.cov(x, y)` returns a 2D array where entries `[0,1]` and `[1,0]` are the covariances. 
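# #### Checking the Pearson formula numerically (optional)
#
# A quick sketch, not part of the exercise: dividing the off-diagonal entry of
# `np.cov()` by the product of the two standard deviations (all computed with
# `ddof=0` so the normalizations match) should reproduce the off-diagonal entry
# of `np.corrcoef()`, i.e. the Pearson correlation defined above.

#%%

cov_lw = np.cov(versicolor_petal_length, versicolor_petal_width, ddof=0)[0, 1]
rho_manual = cov_lw / (np.std(versicolor_petal_length) *
                       np.std(versicolor_petal_width))

print(rho_manual,
      np.corrcoef(versicolor_petal_length, versicolor_petal_width)[0, 1])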
Entry `[0,0]` is the variance of the data in x, and entry `[1,1]` is the variance of the data in `y`. This 2D output array is called the covariance matrix, since it organizes the self- and covariance.\n#\n# To remind you how the *I. versicolor* petal length and width are related, we include the scatter plot you generated in a previous exercise.\n#\n# **Instructions**\n#\n# * Use `np.cov()` to compute the covariance matrix for the petal length (`versicolor_petal_length`) and width (`versicolor_petal_width`) of *I. versicolor*.\n# * Print the covariance matrix.\n# * Extract the covariance from entry `[0,1]` of the covariance matrix. Note that by symmetry, entry `[1,0]` is the same as entry `[0,1]`.\n# * Print the covariance.\n\n#%%\n\niris_df[['petal length (cm)', 'petal width (cm)']\n ][iris_df.species == 'versicolour'].cov()\n\n#%%\n\n# Compute the covariance matrix: covariance_matrix\ncovariance_matrix = np.cov(versicolor_petal_length, versicolor_petal_width)\n\n# Print covariance matrix\ncovariance_matrix\n\n#%%\n\n# Extract covariance of length and width of petals: petal_cov\npetal_cov = covariance_matrix[0, 1]\n\n# Print the length/width covariance\npetal_cov\n\n\n# ### Computing the Pearson correlation coefficient\n#\n# As mentioned in the video, the Pearson correlation coefficient, also called the Pearson r, is often easier to interpret than the covariance. It is computed using the `np.corrcoef()` function. Like `np.cov()`, it takes two arrays as arguments and returns a 2D array. Entries `[0,0]` and `[1,1]` are necessarily equal to 1 (can you think about why?), and the value we are after is entry `[0,1]`.\n#\n# In this exercise, you will write a function, `pearson_r(x, y)` that takes in two arrays and returns the Pearson correlation coefficient. You will then use this function to compute it for the petal lengths and widths of *I. versicolor*.\n#\n# Again, we include the scatter plot you generated in a previous exercise to remind you how the petal width and length are related.\n#\n# **Instructions**\n#\n# * Define a function with signature `pearson_r(x, y)`.\n# * Use `np.corrcoef()` to compute the correlation matrix of `x` and `y` (pass them to `np.corrcoef()` in that order).\n# * The function returns entry `[0,1]` of the correlation matrix.\n# * Compute the Pearson correlation between the data in the arrays `versicolor_petal_length` and `versicolor_petal_width`. Assign the result to `r`.\n# * Print the result.\n\n#%%\n\niris_df[['petal length (cm)', 'petal width (cm)']\n ][iris_df.species == 'versicolour'].corr()\n\n#%%\n\ndef pearson_r(x, y):\n \"\"\"Compute Pearson correlation coefficient between two arrays.\"\"\"\n # Compute correlation matrix: corr_mat\n corr_mat = np.corrcoef(x, y)\n\n # Return entry [0,1]\n return corr_mat[0, 1]\n\n\n# Compute Pearson correlation coefficient for I. versicolor: r\nr = pearson_r(versicolor_petal_length, versicolor_petal_width)\n\n# Print the result\nprint(r)\n\n\n# # Thinking probabilistically: Discrete variables\n#\n# Statistical inference rests upon probability. Because we can very rarely say anything meaningful with absolute certainty from data, we use probabilistic language to make quantitative statements about data. In this chapter, you will learn how to think probabilistically about discrete quantities, those that can only take certain values, like integers. 
It is an important first step in building the probabilistic language necessary to think statistically.\n\n# ## Probabilistic logic and statistical inference\n#\n# * Probabilistic reasoning allows us to describe uncertainty\n# * Given a set of data, you describe probabilistically what you might expect if those data were acquired repeatedly\n# * This is the heart of statistical inference\n# * It's the process by which we go from measured data to probabilistic conclusions about what we might expect if we collected the same data again.\n\n# ### What is the goal of statistical inference?\n#\n# Why do we do statistical inference?\n#\n# **Possible Answers**\n#\n# * To draw probabilistic conclusions about what we might expect if we collected the same data again.\n# * To draw actionable conclusions from data.\n# * To draw more general conclusions from relatively few data or observations.\n# * __**All of these.**__\n\n# ### Why do we use the language of probablility?\n#\n# Which of the following is not a reason why we use probabilistic language in statistical inference?\n#\n# **Possible Answers**\n#\n# * Probability provides a measure of uncertainty.\n# * __**Probabilistic language is not very precise.**__\n# * Data are almost never exactly the same when acquired again, and probability allows us to say how much we expect them to vary.\n\n# ## Random number generators and hacker statistics\n#\n# * Instead o repeating data acquisition over and over, repeated measurements can be simulated\n# * The concepts of probabilities originated from games of chance\n# * What's the probability of getting 4 heads with 4 flips of a coin?\n# * This type of data can be generated using `np.random.random`\n# * drawn a number between 0 and 1\n# * $<0.5\\longrightarrow\\text{heads}$\n# * $\\geq0.5\\longrightarrow\\text{tails}$\n# * The pseudo random number generator works by starting with an integer, called a seed, and then generates random numbers in succession\n# * The same seed gives the same sequence of random numbers\n# * Manually seed the random number generator for reproducible results\n# * Specified using `np.random.seed()`\n\n# #### Bernoulli Trial\n#\n# * An experiment that has two options, \"success\" (True) and \"failure\" (False).\n\n# #### Hacker stats probabilities\n#\n# * Determine how to simulate data\n# * Simulated it repeatedly\n# * Compute the fraction of trials that had the outcome of interest\n# * Probability is approximately the fraction of trials with the outcome of interest\n\n# #### Simulated coin flips\n\n#%%\n\nnp.random.seed(42)\nrandom_numbers = np.random.random(size=4)\n\nrandom_numbers\n\n#%%\n\nheads = random_numbers < 0.5\n\nheads\n\n#%%\n\nnp.sum(heads)\n\n\n# * The number of heads can be computed by summing the array of Booleans, because in numerical contexts, Python treats True as 1 and False as 0.\n# * We want to know the probability of getting four heads if we were to repeatedly flip the 4 coins\n\n# * without `list comprehension`\n#\n# ```python\n# n_all_heads = 0 # initialize number of 4-heads trials\n#\n# for _ in range(10000):\n# heads = np.random.random(size=4) < 0.5\n# n_heads = np.sum(heads)\n# if n_heads == 4:\n# n_all_heads += 1\n# ```\n#\n# * with `list comprehension`\n\n#%%\n\nn_all_heads = sum([1 for _ in range(10000) if sum(\n np.random.random(size=4) < 0.5) == 4])\n\n#%%\n\nn_all_heads\n\n#%%\n\nn_all_heads/10000\n\n\n# ### Generating random numbers using the `np.random` module\n#\n# We will be hammering the `np.random` module for the rest of this course and its sequel. 
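# #### Sanity check: simulated vs. exact probability (optional)
#
# A small check, not from the course: for four fair coin flips the exact
# probability of four heads is 0.5**4 = 0.0625, so the simulated fraction
# `n_all_heads / 10000` computed above should land close to that value.

#%%

print('simulated:', n_all_heads / 10000)
print('exact:    ', 0.5 ** 4)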
Actually, you will probably call functions from this module more than any other while wearing your hacker statistician hat. Let's start by taking its simplest function, `np.random.random()` for a test spin. The function returns a random number between zero and one. Call `np.random.random()` a few times in the IPython shell. You should see numbers jumping around between zero and one.\n#\n# In this exercise, we'll generate lots of random numbers between zero and one, and then plot a histogram of the results. If the numbers are truly random, all bars in the histogram should be of (close to) equal height.\n#\n# You may have noticed that, in the video, Justin generated 4 random numbers by passing the keyword argument `size=4` to `np.random.random()`. Such an approach is more efficient than a `for` loop: in this exercise, however, you will write a `for` loop to experience hacker statistics as the practice of repeating an experiment over and over again.\n#\n# **Instructions**\n#\n# * Seed the random number generator using the seed `42`.\n# * Initialize an empty array, `random_numbers`, of 100,000 entries to store the random numbers. Make sure you use `np.empty(100000)` to do this.\n# * Write a `for` loop to draw 100,000 random numbers using `np.random.random()`, storing them in the `random_numbers` array. To do so, loop over `range(100000)`.\n# * Plot a histogram of `random_numbers`. It is not necessary to label the axes in this case because we are just checking the random number generator. Hit 'Submit Answer' to show your plot.\n\n#%%\n\n# Seed the random number generator\nnp.random.seed(42)\n\n# Initialize random numbers: random_numbers\nrandom_numbers = np.empty(100000)\n\n# Generate random numbers by looping over range(100000)\nfor i in range(100000):\n random_numbers[i] = np.random.random()\n\n# Plot a histogram\n_ = plt.hist(random_numbers)\n\n# Show the plot\nplt.show()\n\n#%%\n\nsns.distplot(random_numbers)\nplt.show()\n\n\n# **The histogram is nearly flat across the top, indicating there is equal chance a randomly-generated number is in any of the histogram bins.**\n#\n# * [Generating Random Numbers With `NumPy`](https://chrisalbon.com/python/basics/generating_random_numbers_with_numpy/)\n\n# #### Using `np.random.rand`\n\n#%%\n\nrand_num = np.random.rand(100000)\n\n#%%\n\nsns.distplot(rand_num)\nplt.show()\n\n\n# ### The np.random module and Bernoulli trials\n#\n# You can think of a Bernoulli trial as a flip of a possibly biased coin. Specifically, each coin flip has a probability ***p*** of landing heads (success) and probability ***1\u2212p*** of landing tails (failure). In this exercise, you will write a function to perform `n` [Bernoulli trials](https://en.wikipedia.org/wiki/Bernoulli_trial), `perform_bernoulli_trials(n, p)`, which returns the number of successes out of `n` Bernoulli trials, each of which has probability `p` of success. To perform each Bernoulli trial, use the `np.random.random()` function, which returns a random number between zero and one.\n#\n# **Instructions**\n#\n# * Define a function with signature `perform_bernoulli_trials(n, p)`.\n# * Initialize to zero a variable `n_success` the counter of `True` occurrences, which are Bernoulli trial successes.\n# * Write a `for` loop where you perform a Bernoulli trial in each iteration and increment the number of success if the result is `True`. Perform `n` iterations by looping over `range(n)`.\n# * To perform a Bernoulli trial, choose a random number between zero and one using `np.random.random()`. 
If the number you chose is less than `p`, increment n_success (use the `+= 1` operator to achieve this).\n# * The function returns the number of successes `n_success`.\n\n# #### def perform_bernoulli_trials()\n\n#%%\n\ndef perform_bernoulli_trials(n: int = 100000, p: float = 0.5) -> int:\n \"\"\"\n Perform n Bernoulli trials with success probability p\n and return number of successes.\n n: number of iterations\n p: target number between 0 and 1, inclusive\n \"\"\"\n # Initialize number of successes: n_success\n n_success = 0\n\n # Perform trials\n for i in range(n):\n # Choose random number between zero and one: random_number\n random_number = np.random.random()\n\n # If less than p, it's a success so add one to n_success\n if random_number < p:\n n_success += 1\n\n return n_success\n\n\n# ##### With `list comprehension`\n\n#%%\n\ndef perform_bernoulli_trials(n: int = 100000, p: float = 0.5) -> int:\n \"\"\"\n Perform n Bernoulli trials with success probability p\n and return number of successes.\n n: number of iterations\n p: target number between 0 and 1, inclusive\n \"\"\"\n\n return sum([1 for _ in range(n) if np.random.random() < p])\n\n\n# ### How many defaults might we expect?\n#\n# Let's say a bank made 100 mortgage loans. It is possible that anywhere between 0 and 100 of the loans will be defaulted upon. You would like to know the probability of getting a given number of defaults, given that the probability of a default is `p = 0.05`. To investigate this, you will do a simulation. You will perform 100 Bernoulli trials using the `perform_bernoulli_trials()` function you wrote in the previous exercise and record how many defaults we get. Here, a success is a default. (Remember that the word \"success\" just means that the Bernoulli trial evaluates to `True`, i.e., did the loan recipient default?) You will do this for another 100 Bernoulli trials. And again and again until we have tried it 1000 times. Then, you will plot a histogram describing the probability of the number of defaults.\n#\n# **Instructions**\n#\n# * Seed the random number generator to 42.\n# * Initialize `n_defaults`, an empty array, using `np.empty()`. It should contain 1000 entries, since we are doing 1000 simulations.\n# * Write a `for` loop with `1000` iterations to compute the number of defaults per 100 loans using the `perform_bernoulli_trials()` function. It accepts two arguments: the number of trials `n` - in this case 100 - and the probability of success `p` - in this case the probability of a default, which is `0.05`. On each iteration of the loop store the result in an entry of `n_defaults`.\n# * Plot a histogram of `n_defaults`. Include the `normed=True` keyword argument so that the height of the bars of the histogram indicate the probability.\n\n#%%\n\n# Seed random number generator\nnp.random.seed(42)\n\n# Initialize the number of defaults: n_defaults\nn_defaults = np.empty(1000)\n\n# Compute the number of defaults\nfor i in range(1000):\n n_defaults[i] = perform_bernoulli_trials(100, 0.05)\n\n\n# Plot the histogram with default number of bins; label your axes\n_ = plt.hist(n_defaults, density=True)\n_ = plt.xlabel('number of defaults out of 100 loans')\n_ = plt.ylabel('probability')\n\n# Show the plot\nplt.show()\n\n\n# **This is not an optimal way to plot a histogram when the results are known to be integers. 
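# #### One way to bin integer counts (optional sketch)
#
# A possible workaround, sketched here as an aside (the course revisits binning
# in later exercises): center one bin on each integer number of defaults so the
# bar heights line up with the probability of each count.

#%%

# Bin edges at half-integers so each bin is centered on an integer count
bins = np.arange(n_defaults.min() - 0.5, n_defaults.max() + 1.5)

_ = plt.hist(n_defaults, bins=bins, density=True)
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')
plt.show()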
This will be revisited in forthcoming exercises.**\n\n# #### With `list comprehension`\n\n#%%\n\nnp.random.seed(42)\nn_defaults = np.asarray([perform_bernoulli_trials(100, 0.05)\n for _ in range(1000)])\n\nplt.hist(n_defaults, density=True)\nplt.xlabel('number of defaults out of 100 loans')\nplt.ylabel('probability')\nplt.show()\n\n\n# ### Will the bank fail?\n#\n# Using `def ecdf()` from the first section, plot the number of `n_defaults` from the previous exercise, as a CDF.\n#\n# If interest rates are such that the bank will lose money if 10 or more of its loans are defaulted upon, what is the probability that the bank will lose money?\n#\n# **Instructions**\n#\n# * Compute the `x` and `y` values for the ECDF of `n_defaults`.\n# * Plot the ECDF, making sure to label the axes. Remember to include `marker='.'` and `linestyle='none'` in addition to `x` and `y` in your call `plt.plot()`.\n# * Show the plot.\n# * Compute the total number of entries in your `n_defaults` array that were greater than or equal to 10. To do so, compute a boolean array that tells you whether a given entry of `n_defaults` is `>= 10`. Then sum all the entries in this array using `np.sum()`. For example, `np.sum(n_defaults <= 5)` would compute the number of defaults with 5 or *fewer* defaults.\n# * The probability that the bank loses money is the fraction of `n_defaults` that are greater than or equal to 10.\n\n#%%\n\n# Compute ECDF: x, y\nx, y = ecdf(n_defaults)\n\n# Plot the ECDF with labeled axes\nplt.plot(x, y, marker='.', linestyle='none')\nplt.xlabel('Number of Defaults out of 100')\nplt.ylabel('CDF')\n\n# Show the plot\nplt.show()\n\n# Compute the number of 100-loan simulations with 10 or more defaults: n_lose_money\nn_lose_money = sum(n_defaults >= 10)\n\n# Compute and print probability of losing money\nprint('Probability of losing money =', n_lose_money / len(n_defaults))\n\n\n# **As might be expected, about 5/100 defaults occur. 
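# #### Cross-check with the exact Binomial tail probability (optional)
#
# A hedged cross-check, not part of the exercise: the simulated fraction above
# estimates P(10 or more defaults) for a Binomial(n=100, p=0.05); the survival
# function of `scipy.stats.binom` (imported above as `binom`) gives the exact
# value, which should be of the same order as the simulated one.

#%%

print('exact P(>= 10 defaults):', binom.sf(9, 100, 0.05))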
There's about a 2% chance of getting 10 or more defaults out of 100 loans.**\n\n# ## Probability distributions and stories: The Binomial distribution\n#\n# * [Probability Distributions in Python](https://www.datacamp.com/community/tutorials/probability-distributions-python)\n\n# #### Probability Mass Function (PMF)\n#\n# * [Probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function)\n# * The set of probabilities of discrete outcomes\n# * PMF is a property of a discrete probability distribution\n\n# #### Discrete Uniform PMF\n#\n# * ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/discrete_uniform_pmf.JPG)\n# * The outcomes are discrete because only certain values may be attained; there is not option for 3.7\n# * Each result has a uniform probability of 1/6\n\n# #### Probability Distribution\n#\n# * [Probability distribution](https://en.wikipedia.org/wiki/Probability_distribution)\n# * A mathematical description of outcomes\n\n# #### Discrete Uniform Distribution\n#\n# * [Discrete uniform distribution](https://en.wikipedia.org/wiki/Discrete_uniform_distribution)\n# * The outcome of rolling a single fair die, is Discrete Uniformly distributed\n\n# #### Binomial Distribution\n#\n# * [Binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution)\n# * The number ***r*** of successes in ***n*** Bernoulli trials with probability ***p*** of success, is Binomially distributed\n# * The number ***r*** of heads in 4 coin flips with probability ***p = 0.5*** of heads, is Binomially distributed\n\n#%%\n\nnp.random.binomial(4, 0.5)\n\n#%%\n\nnp.random.binomial(4, 0.5, size=10)\n\n\n# ##### Binomial PMF\n#\n# * To plot the Binomial PMF, take 10000 samples from a Binomial distribution of 60 Bernoulli trials with a probability of success of 0.1\n# * The most likely number of successes is 6 out of 60, but it's possible to get as many as 11 or as few as 1\n# * [`scipy.stats.binom`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom.html)\n\n#%%\n\nnp.random.seed(42)\nsamples = np.random.binomial(60, 0.1, size=10_000)\nsamples\n\n\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/binomial_pmf.JPG)\n\n#%%\n\nn, p = 60, 0.1\nx = [x for x in range(17)]\n\nfig, ax = plt.subplots(1, 1)\nax.plot(x, binom.pmf(x, n, p), 'bo', ms=5, label='binom pmf')\nax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=3, alpha=0.5)\nplt.xticks(x)\nplt.ylabel('probability')\nplt.xlabel('number of successes')\nplt.show()\n\n#%%\n\nsns.set()\nx, y = ecdf(samples)\n\nplt.plot(x, y, marker='.', linestyle='none')\nplt.margins(0.02)\nplt.xlabel('Number of Successes')\nplt.ylabel('CDF')\nplt.show()\n\n\n# ### Sampling out of the Binomial distribution\n#\n# Compute the probability mass function for the number of defaults we would expect for 100 loans as in the last section, but instead of simulating all of the Bernoulli trials, perform the sampling using `np.random.binomial()`. This is identical to the calculation you did in the last set of exercises using your custom-written `perform_bernoulli_trials()` function, but far more computationally efficient. Given this extra efficiency, we will take 10,000 samples instead of 1000. After taking the samples, plot the CDF as last time. 
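# #### Empirical vs. theoretical PMF (optional)
#
# A short aside, not part of the exercise: tallying the 10,000
# Binomial(60, 0.1) samples drawn above with `np.bincount()` gives an empirical
# PMF that can be laid over the `binom.pmf()` values plotted earlier.

#%%

counts = np.bincount(samples)
emp_pmf = counts / len(samples)
ks = np.arange(len(emp_pmf))

_ = plt.plot(ks, emp_pmf, marker='o', linestyle='none', label='empirical')
_ = plt.plot(ks, binom.pmf(ks, 60, 0.1),
             marker='x', linestyle='none', label='theoretical')
_ = plt.xlabel('number of successes')
_ = plt.ylabel('probability')
plt.legend()
plt.show()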
This CDF that you are plotting is that of the Binomial distribution.\n#\n# Note: For this exercise and all going forward, the random number generator is pre-seeded for you (with `np.random.seed(42)`) to save you typing that each time.\n#\n# **Instructions**\n#\n# * Draw samples out of the Binomial distribution using `np.random.binomial()`. You should use parameters `n = 100` and `p = 0.05`, and set the `size = 10000`.\n# * Compute the CDF using your previously-written `ecdf()` function.\n# * Plot the CDF with axis labels. The x-axis here is the ***number of defaults out of 100 loans***, while the y-axis is the ***CDF***.\n\n#%%", "target_code": "n_defaults = np.random.binomial(100, 0.05, size=10_000)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## **Statistical Thinking in Python (Part 1)**\n#\n# **Course Description**\n#\n# After all of the hard work of acquiring data and getting them into a form you can work with, you ultimately want to make clear, succinct conclusions from them. This crucial last step of a data analysis pipeline hinges on the principles of statistical inference. In this course, you will start building the foundation you need to think statistically, to speak the language of your data, to understand what they are telling you. The foundations of statistical thinking took decades upon decades to build, but they can be grasped much faster today with the help of computers. With the power of Python-based tools, you will rapidly get up to speed and begin thinking statistically by the end of this course.\n\n# **Imports**\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.patches import Rectangle\nimport numpy as np\nfrom pprint import pprint as pp\nimport csv\nfrom pathlib import Path\nimport seaborn as sns\nfrom scipy.stats import binom\n\nfrom sklearn.datasets import load_iris\n\n\n# **Pandas Configuration Options**\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# **Data Files Location**\n#\n# * Most data files for the exercises can be found on the [course site](https://www.datacamp.com/courses/statistical-thinking-in-python-part-1)\n# * [2008 election results (all states)](https://assets.datacamp.com/production/repositories/469/datasets/8fb59b9a99957c3b9b1c82b623aea54d8ccbcd9f/2008_all_states.csv)\n# * [2008 election results (swing states)](https://assets.datacamp.com/production/repositories/469/datasets/e079fddb581197780e1a7b7af2aeeff7242535f0/2008_swing_states.csv)\n# * [Belmont Stakes](https://assets.datacamp.com/production/repositories/469/datasets/7507bfed990379f246b4f166ea8a57ecf31c6c9d/belmont.csv)\n# * [Speed of light](https://assets.datacamp.com/production/repositories/469/datasets/df23780d215774ff90be0ea93e53f4fb5ebbade8/michelson_speed_of_light.csv)\n\n# **Data File Objects**\n\n\ndata = Path.cwd() / 'data' / 'statistical_thinking_1'\nelections_all_file = data / '2008_all_states.csv'\nelections_swing_file = data / '2008_swing_states.csv'\nbelmont_file = data / 'belmont.csv'\nsol_file = data / 'michelson_speed_of_light.csv'\n\n\n# **Iris Data Set**\n\n\niris = load_iris()\niris_df = pd.DataFrame(data=np.c_[\n iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])\n\n\ndef iris_typing(x):\n types = {0.0: 'setosa',\n 1.0: 'versicolour',\n 2.0: 'virginica'}\n return types[x]\n\n\niris_df['species'] = iris_df.target.apply(iris_typing)\niris_df.head()\n\n\n# # Graphical exploratory data analysis\n#\n# Look before you leap! 
A very important proverb, indeed. Prior to diving in headlong into sophisticated statistical inference techniques, you should first explore your data by plotting them and computing simple summary statistics. This process, called exploratory data analysis, is a crucial first step in statistical analysis of data. So it is a fitting subject for the first chapter of Statistical Thinking in Python.\n\n# ## Introduction to exploratory data analysis\n#\n# * Exploring the data is a crucial step of the analysis.\n# * Organizing\n# * Plotting\n# * Computing numerical summaries\n# * This idea is known as exploratory data analysis (EDA)\n# * \"Exploratory data analysis can never be the whole story, but nothing else can serve as the foundation stone.\" - [John Tukey](https://en.wikipedia.org/wiki/John_Tukey)\n\n\nswing = pd.read_csv(elections_swing_file)\nswing.head()\n\n\n# * The raw data isn't particularly informative\n# * We could start computing parameters and their confidence intervals and do hypothesis test...\n# * ...however, we should graphically explore the data first\n\n# ### Tukey's comments on EDA\n#\n# Even though you probably have not read Tukey's book, I suspect you already have a good idea about his viewpoint from the video introducing you to exploratory data analysis. Which of the following quotes is not directly from Tukey?\n#\n# 1. Exploratory data analysis is detective work.\n# 1. There is no excuse for failing to plot and look.\n# 1. The greatest value of a picture is that it forces us to notice what we never expected to see.\n# 1. It is important to understand what you can do before you learn how to measure how well you seem to have done it.\n# 1. ~~**Often times EDA is too time consuming, so it is better to jump right in and do your hypothesis tests.**~~\n\n# ### Advantages of graphical EDA\n#\n# Which of the following is not true of graphical EDA?\n#\n# 1. It often involves converting tabular data into graphical form.\n# 1. If done well, graphical representations can allow for more rapid interpretation of data.\n# 1. ~~**A nice looking plot is always the end goal of a statistical analysis.**~~\n# 1. There is no excuse for neglecting to do graphical EDA.\n\n# ## Plotting a histogram\n#\n# * always label the axes\n\n\nbin_edges = [x for x in range(0, 110, 10)]\nplt.hist(x=swing.dem_share, bins=bin_edges, edgecolor='black')\nplt.xticks(bin_edges)\nplt.yticks(bin_edges[:-1])\nplt.xlabel('Percent of vote for Obama')\nplt.ylabel('Number of Counties')\nplt.show()\n\n\n# **Seaborn**\n\n\nsns.set()\n\n\nplt.hist(x=swing.dem_share)\nplt.xlabel('Percent of vote for Obama')\nplt.ylabel('Number of Counties')\nplt.show()\n\n\n# ### Plotting a histogram of iris data\n#\n# For the exercises in this section, you will use a classic data set collected by botanist Edward Anderson and made famous by Ronald Fisher, one of the most prolific statisticians in history. Anderson carefully measured the anatomical properties of samples of three different species of iris, *Iris setosa*, *Iris versicolor*, and *Iris virginica*. The full data set is [available as part of scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html). Here, you will work with his measurements of petal length.\n#\n# Plot a histogram of the petal lengths of his 50 samples of Iris versicolor using matplotlib/seaborn's default settings. 
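# #### Aside: letting NumPy choose the bins (optional)
#
# A brief aside, not from the course: instead of hand-picked `bin_edges`,
# `plt.hist()` also accepts the string bin rules understood by
# `np.histogram()`, e.g. the Freedman-Diaconis rule `'fd'` or `'auto'`.

#%%

plt.hist(swing['dem_share'], bins='fd', edgecolor='black')
plt.xlabel('Percent of vote for Obama')
plt.ylabel('Number of Counties')
plt.show()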
Recall that to specify the default seaborn style, you can use `sns.set()`, where `sns` is the alias that `seaborn` is imported as.\n#\n# The subset of the data set containing the Iris versicolor petal lengths in units of centimeters (cm) is stored in the NumPy array `versicolor_petal_length`.\n#\n# In the video, Justin plotted the histograms by using the `pandas` library and indexing the DataFrame to extract the desired column. Here, however, you only need to use the provided NumPy array. Also, Justin assigned his plotting statements (except for `plt.show()`) to the dummy variable `_`. This is to prevent unnecessary output from being displayed. It is not required for your solutions to these exercises, however it is good practice to use it. Alternatively, if you are working in an interactive environment such as a Jupyter notebook, you could use a `;` after your plotting statements to achieve the same effect. Justin prefers using `_`. Therefore, you will see it used in the solution code.\n#\n# **Instructions**\n#\n# * Import `matplotlib.pyplot` and `seaborn` as their usual aliases (`plt` and `sns`).\n# * Use `seaborn` to set the plotting defaults.\n# * Plot a histogram of the Iris versicolor petal lengths using `plt.hist()` and the provided NumPy array `versicolor_petal_length`.\n# * Show the histogram using `plt.show()`.\n\n\nversicolor_petal_length = iris_df['petal length (cm)'][iris_df.species ==\n 'versicolour']\n\n\nplt.hist(versicolor_petal_length)\nplt.show()\n\n\n# ### Axis labels!\n#\n# In the last exercise, you made a nice histogram of petal lengths of Iris versicolor, but **you didn't label the axes!** That's ok; it's not your fault since we didn't ask you to. Now, add axis labels to the plot using `plt.xlabel()` and `plt.ylabel()`. Don't forget to add units and assign both statements to `_`. The packages `matplotlib.pyplot` and `seaborn` are already imported with their standard aliases. This will be the case in what follows, unless specified otherwise.\n#\n# **Instructions**\n#\n# * Label the axes. Don't forget that you should always include units in your axis labels. Your y-axis label is just `'count'`. Your x-axis label is `'petal length (cm)'`. The units are essential!\n# * Display the plot constructed in the above steps using `plt.show()`.\n\n\nplt.hist(versicolor_petal_length)\nplt.xlabel('petal length (cm)')\nplt.ylabel('count')\nplt.show()\n\n\n# ### Adjusting the number of bins in a histogram\n#\n# The histogram you just made had ten bins. This is the default of matplotlib. The \"square root rule\" is a commonly-used rule of thumb for choosing number of bins: choose the number of bins to be the square root of the number of samples. Plot the histogram of Iris versicolor petal lengths again, this time using the square root rule for the number of bins. You specify the number of bins using the `bins` keyword argument of `plt.hist()`.\n#\n# The plotting utilities are already imported and the seaborn defaults already set. The variable you defined in the last exercise, `versicolor_petal_length`, is already in your namespace.\n#\n# **Instructions**\n#\n# * Import `numpy` as `np`. 
This gives access to the square root function, `np.sqrt()`.\n# * Determine how many data points you have using `len()`.\n# * Compute the number of bins using the square root rule.\n# * Convert the number of bins to an integer using the built in `int()` function.\n# * Generate the histogram and make sure to use the `bins` keyword argument.\n# * Hit 'Submit Answer' to plot the figure and see the fruit of your labors!\n\n\n# Compute number of data points: n_data\nn_data = len(versicolor_petal_length)\n\n# Number of bins is the square root of number of data points: n_bins\nn_bins = np.sqrt(n_data)\n\n# Convert number of bins to integer: n_bins\nn_bins = int(n_bins)\n\n# Plot the histogram\n_ = plt.hist(versicolor_petal_length, bins=n_bins)\n\n# Label axes\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('count')\n\n# Show histogram\nplt.show()\n\n\n# ## Plotting all of your data: Bee swarm plots\n#\n# * Binning Bias: The same data may be interpreted differently depending on choice of bins\n# * Additionally, all of the data isn't being plotted; the precision of the actual data is lost in the bins\n# * These issues can be resolved with swarm plots\n# * Point position along the y-axis is the quantitative information\n# * The data are spread in x to make them visible, but their precise location along the x-axis is unimportant\n# * No binning bias and all the data are displayed.\n# * Seaborn & Pandas\n\n\nsns.swarmplot(x='state', y='dem_share', data=swing)\nplt.xlabel('state')\nplt.ylabel('percent of vote for Obama')\nplt.title('% of Vote per Swing State County')\nplt.show()\n\n\n# ### Bee swarm plot\n#\n# Make a bee swarm plot of the iris petal lengths. Your x-axis should contain each of the three species, and the y-axis the petal lengths. A data frame containing the data is in your namespace as `df`.\n#\n# For your reference, the code Justin used to create the bee swarm plot in the video is provided below:\n#\n# ```python\n# _ = sns.swarmplot(x='state', y='dem_share', data=df_swing)\n# _ = plt.xlabel('state')\n# _ = plt.ylabel('percent of vote for Obama')\n# plt.show()\n# ```\n#\n# In the IPython Shell, you can use `sns.swarmplot`? or `help(sns.swarmplot)` for more details on how to make bee swarm plots using seaborn.\n#\n# **Instructions**\n#\n# * In the IPython Shell, inspect the DataFrame `df` using `df.head()`. This will let you identify which column names you need to pass as the `x` and `y` keyword arguments in your call to `sns.swarmplot()`.\n# * Use `sns.swarmplot()` to make a bee swarm plot from the DataFrame containing the Fisher iris data set, `df`. The x-axis should contain each of the three species, and the y-axis should contain the petal lengths.\n# * Label the axes.\n# * Show your plot.\n\n\nsns.swarmplot(x='species', y='petal length (cm)', data=iris_df)\nplt.xlabel('species')\nplt.ylabel('petal length (cm)')\nplt.show()\n\n\n# ### Interpreting a bee swarm plot\n#\n# Which of the following conclusions could you draw from the bee swarm plot of iris petal lengths you generated in the previous exercise? For your convenience, the bee swarm plot is regenerated and shown to the right.\n#\n# **Instructions**\n#\n# Possible Answers\n# 1. All I. versicolor petals are shorter than I. virginica petals.\n# 1. I. setosa petals have a broader range of lengths than the other two species.\n# 1. __**I. virginica petals tend to be the longest, and I. setosa petals tend to be the shortest of the three species.**__\n# 1. I. versicolor is a hybrid of I. virginica and I. 
setosa.\n\n# ## Plotting all of your data: Empirical cumulative distribution functions\n#\n# * [Empirical Distribution Function](https://en.wikipedia.org/wiki/Empirical_distribution_function)\n# * [Empirical Distribution Function / Empirical CDF](https://www.statisticshowto.datasciencecentral.com/empirical-distribution-function/)\n# * An **empirical cumulative distribution function** (also called the empirical distribution function, **ECDF**, or just **EDF**) and a [cumulative distribution function](https://www.statisticshowto.datasciencecentral.com/cumulative-distribution-function/) are basically the same thing; they are both probability models for data. While ***a CDF is a hypothetical model of a distribution***, **the ECDF models empirical (i.e. observed) data**. To put this another way, __**the ECDF is the [probability distribution](https://www.statisticshowto.datasciencecentral.com/probability-distribution/) you would get if you sampled from your [sample](https://www.statisticshowto.datasciencecentral.com/sample/), instead of the [population](https://www.statisticshowto.datasciencecentral.com/what-is-a-population/)**__. Lets say you have a set of experimental (observed) data $x_{1},x_{2},\\,\\ldots\\,x_{n}$. The EDF will give you the fraction of sample observations less than or equal to a particular value of $x$.\n# * More formally, if you have a set of [order statistics](https://www.statisticshowto.datasciencecentral.com/order-statistics/) ($y_{1}\", color='b'))\n\nax.annotate('75% of counties had < 0.5 vote for Obama', xy=(50, .75),\n xytext=(55, 0.6), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='b'))\n\nplt.show()\n\n\n# #### plot multiple ECDFs\n\n\nfig, ax = plt.subplots(figsize=(10, 5))\nax.margins(0.05) # Default margin is 0.05, value 0 means fit\n\nfor state in swing.state.unique():\n x = np.sort(swing['dem_share'][swing.state == state])\n y = np.arange(1, len(x)+1) / len(x)\n ax.plot(x, y, marker='.', linestyle='none', label=state)\n\nplt.xlabel('percent of vote for Obama')\nplt.ylabel('ECDF')\nplt.legend()\n\nplt.show()\n\n\n# ### Computing the ECDF\n#\n# In this exercise, you will write a function that takes as input a 1D array of data and then returns the `x` and `y` values of the ECDF. You will use this function over and over again throughout this course and its sequel. ECDFs are among the most important plots in statistical analysis. You can write your own function, `foo(x,y)` according to the following skeleton:\n#\n# ```python\n# def foo(a,b):\n# \"\"\"State what function does here\"\"\"\n# # Computation performed here\n# return x, y\n# ```\n#\n# The function `foo()` above takes two arguments `a` and `b` and returns two values `x` and `y`. The function header `def foo(a,b):` contains the function signature `foo(a,b)`, which consists of the function name, along with its parameters. For more on writing your own functions, see [DataCamp's course Python Data Science Toolbox (Part 1)](https://www.datacamp.com/courses/python-data-science-toolbox-part-1)!\n#\n# **Instructions**\n#\n# * Define a function with the signature `ecdf(data)`. Within the function definition,\n# * Compute the number of data points, `n`, using the `len()` function.\n# * The **x**-values are the sorted data. Use the `np.sort()` function to perform the sorting.\n# * The **y** data of the ECDF go from `1/n` to `1` in equally spaced increments. You can construct this using `np.arange()`. Remember, however, that the end value in `np.arange()` is not inclusive. 
Therefore, `np.arange()` will need to go from `1` to `n+1`. Be sure to divide this by `n`.\n# * The function returns the values `x` and `y`.\n\n# #### def ecdf()\n\n\ndef ecdf(data):\n \"\"\"Compute ECDF for a one-dimensional array of measurements.\"\"\"\n # Number of data points: n\n n = len(data)\n\n # x-data for the ECDF: x\n x = np.sort(data)\n\n # y-data for the ECDF: y\n y = np.arange(1, n+1) / n\n\n return x, y\n\n\n# ### Plotting the ECDF\n#\n# You will now use your `ecdf()` function to compute the ECDF for the petal lengths of Anderson's *Iris versicolor* flowers. You will then plot the ECDF. Recall that your `ecdf()` function returns two arrays so you will need to unpack them. An example of such unpacking is `x, y = foo(data)`, for some function `foo()`.\n#\n# **Instructions**\n#\n# * Use `ecdf()` to compute the ECDF of `versicolor_petal_length`. Unpack the output into `x_vers` and `y_vers`.\n# * Plot the ECDF as dots. Remember to include `marker = '.'` and `linestyle = 'none'` in addition to `x_vers` and `y_vers` as arguments inside `plt.plot()`.\n# * Label the axes. You can label the y-axis `'ECDF'`.\n# * Show your plot.\n\n\n# Compute ECDF for versicolor data: x_vers, y_vers\nx, y = ecdf(versicolor_petal_length)\n\n# Generate plot\nplt.plot(x, y, marker='.', linestyle='none')\n\n# Label the axes\nplt.xlabel('Versicolor Petal Length (cm)')\nplt.ylabel('ECDF')\n\n# Display the plot\nplt.margins(0.02) # keep data off plot edges\nplt.show()\n\n\n# ### Comparison of ECDFs\n#\n# ECDFs also allow you to compare two or more distributions (though plots get cluttered if you have too many). Here, you will plot ECDFs for the petal lengths of all three iris species. You already wrote a function to generate ECDFs so you can put it to good use!\n#\n# To overlay all three ECDFs on the same plot, you can use `plt.plot()` three times, once for each ECDF. Remember to include `marker='.'` and `linestyle='none'` as arguments inside `plt.plot()`.\n#\n# **Instructions**\n#\n# * Compute ECDFs for each of the three species using your `ecdf()` function. The variables `setosa_petal_length`, `versicolor_petal_length`, and `virginica_petal_length` are all in your namespace. Unpack the ECDFs into `x_set`, `y_set`, `x_vers`, `y_vers` and `x_virg`, `y_virg`, respectively.\n# * Plot all three ECDFs on the same plot as dots. To do this, you will need three `plt.plot()` commands. 
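# #### The ECDF as a step function (optional)
#
# A small alternative view, not part of the exercise: the same `ecdf()` output
# can be drawn as a proper step function with the `drawstyle='steps-post'`
# option of `plt.plot()`, rather than as isolated dots.

#%%

x_step, y_step = ecdf(versicolor_petal_length)

_ = plt.plot(x_step, y_step, drawstyle='steps-post')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')
plt.margins(0.02)
plt.show()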
Assign the result of each to `_`.\n# * A legend and axis labels have been added for you, so hit 'Submit Answer' to see all the ECDFs!\n\n\nvirginica_petal_length = iris_df['petal length (cm)'][iris_df.species ==\n 'virginica']\nsetosa_petal_length = iris_df['petal length (cm)'][iris_df.species == 'setosa']\n\n# Compute ECDFs\nx_set, y_set = ecdf(setosa_petal_length)\nx_vers, y_vers = ecdf(versicolor_petal_length)\nx_virg, y_virg = ecdf(virginica_petal_length)\n\n# Plot all ECDFs on the same plot\nplt.plot(x_set, y_set, marker='.', linestyle='none')\nplt.plot(x_vers, y_vers, marker='.', linestyle='none')\nplt.plot(x_virg, y_virg, marker='.', linestyle='none')\n\n# Annotate the plot\nplt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('ECDF')\n\n# Display the plot\nplt.show()\n\n\n# ## Onward toward the whole story\n#\n# * Start with graphical eda!\n#\n# **Coming up...**\n#\n# * Thinking probabilistically\n# * Discrete and continuous distributions\n# * The power of hacker statistics using np.random()\n\n# # Quantitative exploratory data analysis\n#\n# In the last chapter, you learned how to graphically explore data. In this chapter, you will compute useful summary statistics, which serve to concisely describe salient features of a data set with a few numbers.\n\n# ## Introduction to summary statistics: The sample mean and median\n#\n# * mean - average\n# * heavily influenced by outliers\n# * `np.mean()`\n# * median - middle value of the sorted dataset\n# * immune to outlier influence\n# * `np.median()`\n\n# ### Means and medians\n#\n# Which one of the following statements is true about means and medians?\n#\n# **Possible Answers**\n#\n# * ~~An outlier can significantly affect the value of both the mean and the median.~~\n# * **An outlier can significantly affect the value of the mean, but not the median.**\n# * ~~Means and medians are in general both robust to single outliers.~~\n# * ~~The mean and median are equal if there is an odd number of data points.~~\n\n# ### Computing means\n#\n# The mean of all measurements gives an indication of the typical magnitude of a measurement. It is computed using `np.mean()`.\n#\n# **Instructions**\n#\n# * Compute the mean petal length of Iris versicolor from Anderson's classic data set. The variable `versicolor_petal_length` is provided in your namespace. Assign the mean to `mean_length_vers`.\n\n\n# Compute the mean: mean_length_vers\nmean_length_vers = np.mean(versicolor_petal_length)\n\n# Print the result with some nice formatting\nprint('I. 
versicolor:', mean_length_vers, 'cm')\n\n\n# #### with pandas.DataFrame\n\n\niris_df.groupby(['species']).mean()\n\n\n# ## Percentiles, outliers and box plots\n#\n# * The median is a special name for the 50th percentile\n# * 50% of the data are less than the median\n# * The 25th percentile is the value of the data point that is greater than 25% of the sorted data\n# * percentiles are useful summary statistics and can be computed using `np.percentile()`\n#\n# **Computing Percentiles**\n#\n# ```python\n# np.percentile(df_swing['dem_share'], [25, 50, 75])\n# ```\n#\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/box_plot.JPG)\n#\n# * Box plots are a graphical methode for displying summary statistics\n# * median is the middle line: 50th percentile\n# * bottom and top line of the box represent the 25th & 75th percentile, repectively\n# * the space between the 25th and 75th percentile is the interquartile range (IQR)\n# * Whiskers extent a distance of 1.5 time the IQR, or the extent of the data, whichever is less extreme\n# * Any points outside the whiskers are plotted as individual points, which we demarcate as outliers\n# * There is no single definition for an outlier, however, being more than 2 IQRs away from the median is a common criterion.\n# * An outlier is not necessarily erroneous\n# * Box plots are a great alternative to bee swarm plots, becasue bee swarm plots become too cluttered with large data sets\n\n\nall_states = pd.read_csv(elections_all_file)\nall_states.head()\n\n\nsns.boxplot(x='east_west', y='dem_share', data=all_states)\nplt.xlabel('region')\nplt.ylabel('percent of vote for Obama')\nplt.show()\n\n\n# ### Computing percentiles\n#\n# In this exercise, you will compute the percentiles of petal length of *Iris versicolor*.\n#\n# **Instructions**\n#\n# * Create `percentiles`, a NumPy array of percentiles you want to compute. These are the 2.5th, 25th, 50th, 75th, and 97.5th. You can do so by creating a list containing these ints/floats and convert the list to a NumPy array using `np.array()`. For example, `np.array([30, 50])` would create an array consisting of the 30th and 50th percentiles.\n# * Use `np.percentile()` to compute the percentiles of the petal lengths from the Iris versicolor samples. The variable `versicolor_petal_length` is in your namespace.\n\n\n# Specify array of percentiles: percentiles\npercentiles = np.array([2.5, 25, 50, 75, 97.5])\n\n# Compute percentiles: ptiles_vers\nptiles_vers = np.percentile(versicolor_petal_length, percentiles)\n\n# Print the result\nptiles_vers\n\n\n# ### Comparing percentiles to ECDF\n#\n# To see how the percentiles relate to the ECDF, you will plot the percentiles of Iris versicolor petal lengths you calculated in the last exercise on the ECDF plot you generated in chapter 1. The percentile variables from the previous exercise are available in the workspace as `ptiles_vers` and `percentiles`.\n#\n# Note that to ensure the Y-axis of the ECDF plot remains between 0 and 1, you will need to rescale the `percentiles` array accordingly - in this case, dividing it by 100.\n#\n# **Instructions**\n#\n# * Plot the percentiles as red diamonds on the ECDF. Pass the x and y co-ordinates - `ptiles_vers` and `percentiles/100` - as positional arguments and specify the `marker='D'`, `color='red'` and `linestyle='none'` keyword arguments. 
The argument for the y-axis - `percentiles/100` has been specified for you.\n\n\n# Plot the ECDF\n_ = plt.plot(x_vers, y_vers, '.')\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('ECDF')\n\n# Overlay percentiles as red diamonds.\n_ = plt.plot(ptiles_vers, percentiles/100, marker='D',\n color='red', linestyle='none')\nplt.show()\n\n\n# ### Box-and-whisker plot\n#\n# Making a box plot for the petal lengths is unnecessary because the iris data set is not too large and the bee swarm plot works fine. However, it is always good to get some practice. Make a box plot of the iris petal lengths. You have a pandas DataFrame, `df`, which contains the petal length data, in your namespace. Inspect the data frame `df` in the IPython shell using `df.head()` to make sure you know what the pertinent columns are.\n#\n# For your reference, the code used to produce the box plot in the video is provided below:\n#\n# ```python\n# _ = sns.boxplot(x='east_west', y='dem_share', data=df_all_states)\n#\n# _ = plt.xlabel('region')\n#\n# _ = plt.ylabel('percent of vote for Obama')\n# ```\n#\n# In the IPython Shell, you can use `sns.boxplot?` or `help(sns.boxplot)` for more details on how to make box plots using seaborn.\n#\n# **Instructions**\n#\n# * The set-up is exactly the same as for the bee swarm plot; you just call `sns.boxplot()` with the same keyword arguments as you would `sns.swarmplot()`. The x-axis is `'species'` and y-axis is `'petal length (cm)'`.\n# * Don't forget to label your axes!\n\n\nfig, ax = plt.subplots(figsize=(10, 7))\n# Create box plot with Seaborn's default settings\n_ = sns.boxplot(x='species', y='petal length (cm)', data=iris_df)\n\n# Label the axes\n_ = plt.ylabel('petal length (cm)')\n_ = plt.xlabel('species')\n\n# Show the plot\nplt.show()\n\n\n# ## Variance and standard deviation\n#\n# * measures of spread\n# * variance:\n# * The mean squared distance of the data from the mean\n# * $$variance = \\frac{1}{n}\\sum_{i=1}^{n}(x_{i} - \\overline{x})^2$$\n# * because of the squared quantity, variance doesn't have the same units as the measurement\n# * standard deviation:\n# * $$\\sqrt{variance}$$\n\n# #### Variance\n\n\ndem_share_fl = all_states.dem_share[all_states.state == 'FL']\n\n\nnp.var(dem_share_fl)\n\n\nall_states_var = all_states[['state', 'total_votes', 'dem_votes',\n 'rep_votes', 'other_votes', 'dem_share']].groupby(['state']).var(ddof=0)\nall_states_var.dem_share.loc['FL']\n\n\nall_states_var.head()\n\n\n# #### Standard Deviation\n\n\nnp.std(dem_share_fl)\n\n\nnp.sqrt(np.var(dem_share_fl))\n\n\nall_states_std = all_states[['state', 'total_votes', 'dem_votes',\n 'rep_votes', 'other_votes', 'dem_share']].groupby(['state']).std(ddof=0)\nall_states_std.dem_share.loc['FL']\n\n\nall_states_std.head()\n\n\n# ### Computing the variance\n#\n# It is important to have some understanding of what commonly-used functions are doing under the hood. Though you may already know how to compute variances, this is a beginner course that does not assume so. In this exercise, we will explicitly compute the variance of the petal length of Iris veriscolor using the equations discussed in the videos. We will then use `np.var()` to compute it.\n#\n# **Instructions**\n#\n# * Create an array called differences that is the `difference` between the petal lengths (`versicolor_petal_length`) and the mean petal length. The variable `versicolor_petal_length` is already in your namespace as a NumPy array so you can take advantage of NumPy's vectorized operations.\n# * Square each element in this array. 
For example, `x**2` squares each element in the array `x`. Store the result as `diff_sq`.\n# * Compute the mean of the elements in `diff_sq` using `np.mean()`. Store the result as `variance_explicit`.\n# * Compute the variance of `versicolor_petal_length` using `np.var()`. Store the result as `variance_np`.\n# * Print both `variance_explicit` and `variance_np` in one `print` call to make sure they are consistent.\n\n\n# Array of differences to mean: differences\ndifferences = versicolor_petal_length - np.mean(versicolor_petal_length)\n\n# Square the differences: diff_sq\ndiff_sq = differences**2\n\n# Compute the mean square difference: variance_explicit\nvariance_explicit = np.mean(diff_sq)\n\n# Compute the variance using NumPy: variance_np\nvariance_np = np.var(versicolor_petal_length)\n\n# Print the results\nprint(variance_explicit, variance_np)\n\n\n# ### The standard deviation and the variance\n#\n# As mentioned in the video, the standard deviation is the square root of the variance. You will see this for yourself by computing the standard deviation using `np.std()` and comparing it to what you get by computing the variance with `np.var()` and then computing the square root.\n#\n# **Instructions**\n#\n# * Compute the variance of the data in the `versicolor_petal_length` array using `np.var()` and store it in a variable called `variance`.\n# * Print the square root of this value.\n# * Print the standard deviation of the data in the `versicolor_petal_length` array using `np.std()`\n\n\n# Compute the variance: variance\nvariance = np.var(versicolor_petal_length)\n\n# Print the square root of the variance\nstd_explicit = np.sqrt(variance)\n\n# Print the standard deviation\nstd_np = np.std(versicolor_petal_length)\n\nprint(std_explicit, std_np)\n\n\n# ## Covariance and Pearson correlation coefficient\n#\n# ![covariance](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/covariance.JPG)\n#\n# * Covariance\n# * $$covariance = \\frac{1}{n}\\sum_{i=1}^{n}(x_{i} - \\overline{x})(y_{i} - \\overline{y})$$\n# * The data point differs from the mean vote share and the mean total votes for Obama\n# * The differences for each data point can be computed\n# * The covariance is the mean of the product of these differences\n# * If both x and y tend to be above or below their respective means together, as they are in this data set, the covariance is positive.\n# * This means they are positively correlated:\n# * When x is high, so is y\n# * When the county is populous, it has more votes for Obama\n# * If x is high while y is low, the covariance is negative\n# * This means they are negatively correlated (anticorrelated) - not the case for this data set.\n#\n# ![pearson](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/pearson.JPG)\n#\n# * Pearson correlation\n# * A more generally applicable measure of how two variables depend on each other, should be dimensionless (not units).\n# * $$\\rho = Pearson\\space correlation = \\frac{covariance}{(std\\space of\\space x)(std\\space of\\space y)}$$\n# * $$\\rho = \\frac{variability\\space due\\space to\\space codependence}{independent\\space variability}$$\n# * Comparison of the variability in the data due to codependence (the covariance) to the variability inherent to each variable independently (their standard deviations).\n# * It's dimensionless and ranges from -1 (for complete anticorrelation) to 1 (for complete correlation).\n# * A value of zero means there is no correlation 
between the data, as shown in the upper left plot.\n# * Good metric for correlation between two variables.\n\n\nplt.figure(figsize=(10, 8))\nsns.scatterplot(x='total_votes', y='dem_share', data=swing, hue='state')\nplt.xlabel('total votes')\nplt.ylabel('% of vote for Obama')\nplt.xticks([x for x in range(0, 1000000, 100000)], rotation=40)\nplt.yticks([x for x in range(0, 100, 10)])\n\n# Create a Rectangle patch\nplt.gca().add_patch(Rectangle((400000, 52), 500000, 34,\n linewidth=1, edgecolor='b', facecolor='none'))\n\nplt.gca().add_patch(Rectangle((0, 5), 50000, 45,\n linewidth=1, edgecolor='r', facecolor='none'))\n\n# Annotate\nplt.annotate('12 largest counties; most vote for Obama', xy=(650000, 52), weight='bold',\n xytext=(400000, 35), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='b'))\n\nplt.annotate('small counties; most vote for McCain', xy=(50000, 20), weight='bold',\n xytext=(150000, 7), fontsize=10, arrowprops=dict(arrowstyle=\"->\", color='r'))\n\nplt.show()\n\n\n# ### Scatter plots\n#\n# When you made bee swarm plots, box plots, and ECDF plots in previous exercises, you compared the petal lengths of different species of iris. But what if you want to compare two properties of a single species? This is exactly what we will do in this exercise. We will make a **scatter plot** of the petal length and width measurements of Anderson's Iris versicolor flowers. If the flower scales (that is, it preserves its proportion as it grows), we would expect the length and width to be correlated.\n#\n# For your reference, the code used to produce the scatter plot in the video is provided below:\n#\n# ```python\n# _ = plt.plot(total_votes/1000, dem_share, marker='.', linestyle='none')\n# _ = plt.xlabel('total votes (thousands)')\n# _ = plt.ylabel('percent of vote for Obama')\n# ```\n#\n# **Instructions**\n#\n# * Use `plt.plot()` with the appropriate keyword arguments to make a scatter plot of versicolor petal length (x-axis) versus petal width (y-axis). The variables `versicolor_petal_length` and `versicolor_petal_width` are already in your namespace. Do not forget to use the `marker='.'` and `linestyle='none'` keyword arguments.\n# * Label the axes.\n# * Display the plot.\n\n\nversicolor_petal_width = iris_df['petal width (cm)'][iris_df.species ==\n 'versicolour']\n\n# Make a scatter plot\n_ = plt.plot(versicolor_petal_length, versicolor_petal_width,\n marker='.', linestyle='none')\n\n# Label the axes\n_ = plt.xlabel('petal length (cm)')\n_ = plt.ylabel('petal width (cm)')\n\n# Show the result\nplt.show()\n\n\n# ### Variance and covariance by looking\n#\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/var_covar_by_looking.JPG)\n#\n# Consider four scatter plots of x-y data, appearing to the right. Which has, respectively,\n#\n# * the highest variance in the variable x,\n# * the highest covariance,\n# * negative covariance?\n#\n# **Instructions**\n#\n# Possible Answers\n# * ~~a, c, b~~\n# * ~~d, c, a~~\n# * __**d, c, b**__\n# * ~~d, d, b~~\n\n# ### Computing the covariance\n#\n# The covariance may be computed using the Numpy function `np.cov()`. For example, we have two sets of data `x` and `y`, `np.cov(x, y)` returns a 2D array where entries `[0,1]` and `[1,0]` are the covariances. Entry `[0,0]` is the variance of the data in x, and entry `[1,1]` is the variance of the data in `y`. This 2D output array is called the covariance matrix, since it organizes the self- and covariance.\n#\n# To remind you how the *I. 
versicolor* petal length and width are related, we include the scatter plot you generated in a previous exercise.\n#\n# **Instructions**\n#\n# * Use `np.cov()` to compute the covariance matrix for the petal length (`versicolor_petal_length`) and width (`versicolor_petal_width`) of *I. versicolor*.\n# * Print the covariance matrix.\n# * Extract the covariance from entry `[0,1]` of the covariance matrix. Note that by symmetry, entry `[1,0]` is the same as entry `[0,1]`.\n# * Print the covariance.\n\n\niris_df[['petal length (cm)', 'petal width (cm)']\n ][iris_df.species == 'versicolour'].cov()\n\n\n# Compute the covariance matrix: covariance_matrix\ncovariance_matrix = np.cov(versicolor_petal_length, versicolor_petal_width)\n\n# Print covariance matrix\ncovariance_matrix\n\n\n# Extract covariance of length and width of petals: petal_cov\npetal_cov = covariance_matrix[0, 1]\n\n# Print the length/width covariance\npetal_cov\n\n\n# ### Computing the Pearson correlation coefficient\n#\n# As mentioned in the video, the Pearson correlation coefficient, also called the Pearson r, is often easier to interpret than the covariance. It is computed using the `np.corrcoef()` function. Like `np.cov()`, it takes two arrays as arguments and returns a 2D array. Entries `[0,0]` and `[1,1]` are necessarily equal to 1 (can you think about why?), and the value we are after is entry `[0,1]`.\n#\n# In this exercise, you will write a function, `pearson_r(x, y)` that takes in two arrays and returns the Pearson correlation coefficient. You will then use this function to compute it for the petal lengths and widths of *I. versicolor*.\n#\n# Again, we include the scatter plot you generated in a previous exercise to remind you how the petal width and length are related.\n#\n# **Instructions**\n#\n# * Define a function with signature `pearson_r(x, y)`.\n# * Use `np.corrcoef()` to compute the correlation matrix of `x` and `y` (pass them to `np.corrcoef()` in that order).\n# * The function returns entry `[0,1]` of the correlation matrix.\n# * Compute the Pearson correlation between the data in the arrays `versicolor_petal_length` and `versicolor_petal_width`. Assign the result to `r`.\n# * Print the result.\n\n\niris_df[['petal length (cm)', 'petal width (cm)']\n ][iris_df.species == 'versicolour'].corr()\n\n\ndef pearson_r(x, y):\n \"\"\"Compute Pearson correlation coefficient between two arrays.\"\"\"\n # Compute correlation matrix: corr_mat\n corr_mat = np.corrcoef(x, y)\n\n # Return entry [0,1]\n return corr_mat[0, 1]\n\n\n# Compute Pearson correlation coefficient for I. versicolor: r\nr = pearson_r(versicolor_petal_length, versicolor_petal_width)\n\n# Print the result\nprint(r)\n\n\n# # Thinking probabilistically: Discrete variables\n#\n# Statistical inference rests upon probability. Because we can very rarely say anything meaningful with absolute certainty from data, we use probabilistic language to make quantitative statements about data. In this chapter, you will learn how to think probabilistically about discrete quantities, those that can only take certain values, like integers. 
It is an important first step in building the probabilistic language necessary to think statistically.\n\n# ## Probabilistic logic and statistical inference\n#\n# * Probabilistic reasoning allows us to describe uncertainty\n# * Given a set of data, you describe probabilistically what you might expect if those data were acquired repeatedly\n# * This is the heart of statistical inference\n# * It's the process by which we go from measured data to probabilistic conclusions about what we might expect if we collected the same data again.\n\n# ### What is the goal of statistical inference?\n#\n# Why do we do statistical inference?\n#\n# **Possible Answers**\n#\n# * To draw probabilistic conclusions about what we might expect if we collected the same data again.\n# * To draw actionable conclusions from data.\n# * To draw more general conclusions from relatively few data or observations.\n# * __**All of these.**__\n\n# ### Why do we use the language of probablility?\n#\n# Which of the following is not a reason why we use probabilistic language in statistical inference?\n#\n# **Possible Answers**\n#\n# * Probability provides a measure of uncertainty.\n# * __**Probabilistic language is not very precise.**__\n# * Data are almost never exactly the same when acquired again, and probability allows us to say how much we expect them to vary.\n\n# ## Random number generators and hacker statistics\n#\n# * Instead o repeating data acquisition over and over, repeated measurements can be simulated\n# * The concepts of probabilities originated from games of chance\n# * What's the probability of getting 4 heads with 4 flips of a coin?\n# * This type of data can be generated using `np.random.random`\n# * drawn a number between 0 and 1\n# * $<0.5\\longrightarrow\\text{heads}$\n# * $\\geq0.5\\longrightarrow\\text{tails}$\n# * The pseudo random number generator works by starting with an integer, called a seed, and then generates random numbers in succession\n# * The same seed gives the same sequence of random numbers\n# * Manually seed the random number generator for reproducible results\n# * Specified using `np.random.seed()`\n\n# #### Bernoulli Trial\n#\n# * An experiment that has two options, \"success\" (True) and \"failure\" (False).\n\n# #### Hacker stats probabilities\n#\n# * Determine how to simulate data\n# * Simulated it repeatedly\n# * Compute the fraction of trials that had the outcome of interest\n# * Probability is approximately the fraction of trials with the outcome of interest\n\n# #### Simulated coin flips\n\n\nnp.random.seed(42)\nrandom_numbers = np.random.random(size=4)\n\nrandom_numbers\n\n\nheads = random_numbers < 0.5\n\nheads\n\n\nnp.sum(heads)\n\n\n# * The number of heads can be computed by summing the array of Booleans, because in numerical contexts, Python treats True as 1 and False as 0.\n# * We want to know the probability of getting four heads if we were to repeatedly flip the 4 coins\n\n# * without `list comprehension`\n#\n# ```python\n# n_all_heads = 0 # initialize number of 4-heads trials\n#\n# for _ in range(10000):\n# heads = np.random.random(size=4) < 0.5\n# n_heads = np.sum(heads)\n# if n_heads == 4:\n# n_all_heads += 1\n# ```\n#\n# * with `list comprehension`\n\n\nn_all_heads = sum([1 for _ in range(10000) if sum(\n np.random.random(size=4) < 0.5) == 4])\n\n\nn_all_heads\n\n\nn_all_heads/10000\n\n\n# ### Generating random numbers using the `np.random` module\n#\n# We will be hammering the `np.random` module for the rest of this course and its sequel. 
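# A minimal sketch of the hacker-stats recipe described above, wrapped in a small helper (the helper name `estimate_probability` is my own, not from the course):
#
# ```python
# import numpy as np
#
#
# def estimate_probability(simulate_once, n_trials=10000):
#     """Approximate P(outcome) as the fraction of simulated trials where it occurred."""
#     return sum(simulate_once() for _ in range(n_trials)) / n_trials
#
#
# np.random.seed(42)
#
# # P(4 heads in 4 fair coin flips); the exact value is 0.5**4 = 0.0625
# estimate_probability(lambda: np.sum(np.random.random(size=4) < 0.5) == 4)
# ```
#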
Actually, you will probably call functions from this module more than any other while wearing your hacker statistician hat. Let's start by taking its simplest function, `np.random.random()` for a test spin. The function returns a random number between zero and one. Call `np.random.random()` a few times in the IPython shell. You should see numbers jumping around between zero and one.\n#\n# In this exercise, we'll generate lots of random numbers between zero and one, and then plot a histogram of the results. If the numbers are truly random, all bars in the histogram should be of (close to) equal height.\n#\n# You may have noticed that, in the video, Justin generated 4 random numbers by passing the keyword argument `size=4` to `np.random.random()`. Such an approach is more efficient than a `for` loop: in this exercise, however, you will write a `for` loop to experience hacker statistics as the practice of repeating an experiment over and over again.\n#\n# **Instructions**\n#\n# * Seed the random number generator using the seed `42`.\n# * Initialize an empty array, `random_numbers`, of 100,000 entries to store the random numbers. Make sure you use `np.empty(100000)` to do this.\n# * Write a `for` loop to draw 100,000 random numbers using `np.random.random()`, storing them in the `random_numbers` array. To do so, loop over `range(100000)`.\n# * Plot a histogram of `random_numbers`. It is not necessary to label the axes in this case because we are just checking the random number generator. Hit 'Submit Answer' to show your plot.\n\n\n# Seed the random number generator\nnp.random.seed(42)\n\n# Initialize random numbers: random_numbers\nrandom_numbers = np.empty(100000)\n\n# Generate random numbers by looping over range(100000)\nfor i in range(100000):\n random_numbers[i] = np.random.random()\n\n# Plot a histogram\n_ = plt.hist(random_numbers)\n\n# Show the plot\nplt.show()\n\n\nsns.distplot(random_numbers)\nplt.show()\n\n\n# **The histogram is nearly flat across the top, indicating there is equal chance a randomly-generated number is in any of the histogram bins.**\n#\n# * [Generating Random Numbers With `NumPy`](https://chrisalbon.com/python/basics/generating_random_numbers_with_numpy/)\n\n# #### Using `np.random.rand`\n\n\nrand_num = np.random.rand(100000)\n\n\nsns.distplot(rand_num)\nplt.show()\n\n\n# ### The np.random module and Bernoulli trials\n#\n# You can think of a Bernoulli trial as a flip of a possibly biased coin. Specifically, each coin flip has a probability ***p*** of landing heads (success) and probability ***1\u2212p*** of landing tails (failure). In this exercise, you will write a function to perform `n` [Bernoulli trials](https://en.wikipedia.org/wiki/Bernoulli_trial), `perform_bernoulli_trials(n, p)`, which returns the number of successes out of `n` Bernoulli trials, each of which has probability `p` of success. To perform each Bernoulli trial, use the `np.random.random()` function, which returns a random number between zero and one.\n#\n# **Instructions**\n#\n# * Define a function with signature `perform_bernoulli_trials(n, p)`.\n# * Initialize to zero a variable `n_success` the counter of `True` occurrences, which are Bernoulli trial successes.\n# * Write a `for` loop where you perform a Bernoulli trial in each iteration and increment the number of success if the result is `True`. Perform `n` iterations by looping over `range(n)`.\n# * To perform a Bernoulli trial, choose a random number between zero and one using `np.random.random()`. 
If the number you chose is less than `p`, increment n_success (use the `+= 1` operator to achieve this).\n# * The function returns the number of successes `n_success`.\n\n# #### def perform_bernoulli_trials()\n\n\ndef perform_bernoulli_trials(n: int = 100000, p: float = 0.5) -> int:\n \"\"\"\n Perform n Bernoulli trials with success probability p\n and return number of successes.\n n: number of iterations\n p: target number between 0 and 1, inclusive\n \"\"\"\n # Initialize number of successes: n_success\n n_success = 0\n\n # Perform trials\n for i in range(n):\n # Choose random number between zero and one: random_number\n random_number = np.random.random()\n\n # If less than p, it's a success so add one to n_success\n if random_number < p:\n n_success += 1\n\n return n_success\n\n\n# ##### With `list comprehension`\n\n\ndef perform_bernoulli_trials(n: int = 100000, p: float = 0.5) -> int:\n \"\"\"\n Perform n Bernoulli trials with success probability p\n and return number of successes.\n n: number of iterations\n p: target number between 0 and 1, inclusive\n \"\"\"\n\n return sum([1 for _ in range(n) if np.random.random() < p])\n\n\n# ### How many defaults might we expect?\n#\n# Let's say a bank made 100 mortgage loans. It is possible that anywhere between 0 and 100 of the loans will be defaulted upon. You would like to know the probability of getting a given number of defaults, given that the probability of a default is `p = 0.05`. To investigate this, you will do a simulation. You will perform 100 Bernoulli trials using the `perform_bernoulli_trials()` function you wrote in the previous exercise and record how many defaults we get. Here, a success is a default. (Remember that the word \"success\" just means that the Bernoulli trial evaluates to `True`, i.e., did the loan recipient default?) You will do this for another 100 Bernoulli trials. And again and again until we have tried it 1000 times. Then, you will plot a histogram describing the probability of the number of defaults.\n#\n# **Instructions**\n#\n# * Seed the random number generator to 42.\n# * Initialize `n_defaults`, an empty array, using `np.empty()`. It should contain 1000 entries, since we are doing 1000 simulations.\n# * Write a `for` loop with `1000` iterations to compute the number of defaults per 100 loans using the `perform_bernoulli_trials()` function. It accepts two arguments: the number of trials `n` - in this case 100 - and the probability of success `p` - in this case the probability of a default, which is `0.05`. On each iteration of the loop store the result in an entry of `n_defaults`.\n# * Plot a histogram of `n_defaults`. Include the `normed=True` keyword argument so that the height of the bars of the histogram indicate the probability.\n\n\n# Seed random number generator\nnp.random.seed(42)\n\n# Initialize the number of defaults: n_defaults\nn_defaults = np.empty(1000)\n\n# Compute the number of defaults\nfor i in range(1000):\n n_defaults[i] = perform_bernoulli_trials(100, 0.05)\n\n\n# Plot the histogram with default number of bins; label your axes\n_ = plt.hist(n_defaults, density=True)\n_ = plt.xlabel('number of defaults out of 100 loans')\n_ = plt.ylabel('probability')\n\n# Show the plot\nplt.show()\n\n\n# **This is not an optimal way to plot a histogram when the results are known to be integers. 
This will be revisited in forthcoming exercises.**\n\n# #### With `list comprehension`\n\n\nnp.random.seed(42)\nn_defaults = np.asarray([perform_bernoulli_trials(100, 0.05)\n for _ in range(1000)])\n\nplt.hist(n_defaults, density=True)\nplt.xlabel('number of defaults out of 100 loans')\nplt.ylabel('probability')\nplt.show()\n\n\n# ### Will the bank fail?\n#\n# Using `def ecdf()` from the first section, plot the number of `n_defaults` from the previous exercise, as a CDF.\n#\n# If interest rates are such that the bank will lose money if 10 or more of its loans are defaulted upon, what is the probability that the bank will lose money?\n#\n# **Instructions**\n#\n# * Compute the `x` and `y` values for the ECDF of `n_defaults`.\n# * Plot the ECDF, making sure to label the axes. Remember to include `marker='.'` and `linestyle='none'` in addition to `x` and `y` in your call `plt.plot()`.\n# * Show the plot.\n# * Compute the total number of entries in your `n_defaults` array that were greater than or equal to 10. To do so, compute a boolean array that tells you whether a given entry of `n_defaults` is `>= 10`. Then sum all the entries in this array using `np.sum()`. For example, `np.sum(n_defaults <= 5)` would compute the number of defaults with 5 or *fewer* defaults.\n# * The probability that the bank loses money is the fraction of `n_defaults` that are greater than or equal to 10.\n\n\n# Compute ECDF: x, y\nx, y = ecdf(n_defaults)\n\n# Plot the ECDF with labeled axes\nplt.plot(x, y, marker='.', linestyle='none')\nplt.xlabel('Number of Defaults out of 100')\nplt.ylabel('CDF')\n\n# Show the plot\nplt.show()\n\n# Compute the number of 100-loan simulations with 10 or more defaults: n_lose_money\nn_lose_money = sum(n_defaults >= 10)\n\n# Compute and print probability of losing money\nprint('Probability of losing money =', n_lose_money / len(n_defaults))\n\n\n# **As might be expected, about 5/100 defaults occur. 
There's about a 2% chance of getting 10 or more defaults out of 100 loans.**\n\n# ## Probability distributions and stories: The Binomial distribution\n#\n# * [Probability Distributions in Python](https://www.datacamp.com/community/tutorials/probability-distributions-python)\n\n# #### Probability Mass Function (PMF)\n#\n# * [Probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function)\n# * The set of probabilities of discrete outcomes\n# * PMF is a property of a discrete probability distribution\n\n# #### Discrete Uniform PMF\n#\n# * ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/discrete_uniform_pmf.JPG)\n# * The outcomes are discrete because only certain values may be attained; there is not option for 3.7\n# * Each result has a uniform probability of 1/6\n\n# #### Probability Distribution\n#\n# * [Probability distribution](https://en.wikipedia.org/wiki/Probability_distribution)\n# * A mathematical description of outcomes\n\n# #### Discrete Uniform Distribution\n#\n# * [Discrete uniform distribution](https://en.wikipedia.org/wiki/Discrete_uniform_distribution)\n# * The outcome of rolling a single fair die, is Discrete Uniformly distributed\n\n# #### Binomial Distribution\n#\n# * [Binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution)\n# * The number ***r*** of successes in ***n*** Bernoulli trials with probability ***p*** of success, is Binomially distributed\n# * The number ***r*** of heads in 4 coin flips with probability ***p = 0.5*** of heads, is Binomially distributed\n\n\nnp.random.binomial(4, 0.5)\n\n\nnp.random.binomial(4, 0.5, size=10)\n\n\n# ##### Binomial PMF\n#\n# * To plot the Binomial PMF, take 10000 samples from a Binomial distribution of 60 Bernoulli trials with a probability of success of 0.1\n# * The most likely number of successes is 6 out of 60, but it's possible to get as many as 11 or as few as 1\n# * [`scipy.stats.binom`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom.html)\n\n\nnp.random.seed(42)\nsamples = np.random.binomial(60, 0.1, size=10_000)\nsamples\n\n\n# ![](https://raw.githubusercontent.com/trenton3983/DataCamp/master/Images/statistical_thinking_1/binomial_pmf.JPG)\n\n\nn, p = 60, 0.1\nx = [x for x in range(17)]\n\nfig, ax = plt.subplots(1, 1)\nax.plot(x, binom.pmf(x, n, p), 'bo', ms=5, label='binom pmf')\nax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=3, alpha=0.5)\nplt.xticks(x)\nplt.ylabel('probability')\nplt.xlabel('number of successes')\nplt.show()\n\n\nsns.set()\nx, y = ecdf(samples)\n\nplt.plot(x, y, marker='.', linestyle='none')\nplt.margins(0.02)\nplt.xlabel('Number of Successes')\nplt.ylabel('CDF')\nplt.show()\n\n\n# ### Sampling out of the Binomial distribution\n#\n# Compute the probability mass function for the number of defaults we would expect for 100 loans as in the last section, but instead of simulating all of the Bernoulli trials, perform the sampling using `np.random.binomial()`. This is identical to the calculation you did in the last set of exercises using your custom-written `perform_bernoulli_trials()` function, but far more computationally efficient. Given this extra efficiency, we will take 10,000 samples instead of 1000. After taking the samples, plot the CDF as last time. 
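# A minimal sketch of the idea, assuming the `ecdf()` helper written earlier in the course (sorted values on x, evenly spaced cumulative fractions on y):
#
# ```python
# import numpy as np
# import matplotlib.pyplot as plt
#
#
# def ecdf(data):
#     """Return x (sorted data) and y (fraction of points <= each x)."""
#     x = np.sort(data)
#     y = np.arange(1, len(data) + 1) / len(data)
#     return x, y
#
#
# np.random.seed(42)
# n_defaults = np.random.binomial(n=100, p=0.05, size=10000)
#
# x, y = ecdf(n_defaults)
# plt.plot(x, y, marker='.', linestyle='none')
# plt.xlabel('number of defaults out of 100 loans')
# plt.ylabel('CDF')
# plt.show()
# ```
#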
This CDF that you are plotting is that of the Binomial distribution.\n#\n# Note: For this exercise and all going forward, the random number generator is pre-seeded for you (with `np.random.seed(42)`) to save you typing that each time.\n#\n# **Instructions**\n#\n# * Draw samples out of the Binomial distribution using `np.random.binomial()`. You should use parameters `n = 100` and `p = 0.05`, and set the `size = 10000`.\n# * Compute the CDF using your previously-written `ecdf()` function.\n# * Plot the CDF with axis labels. The x-axis here is the ***number of defaults out of 100 loans***, while the y-axis is the ***CDF***.\n\nnp.random.seed(42)\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "intent": "# Take 10,000 samples out of the binomial distribution: n_defaults"}, {"original_comment": "# labels and legend\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Gaussian Process Regression\n#\n# At times you don't care about the underlying model for your data points and just want a model that describes the data. One such fitting technique is know as Gaussian process regression (also know as kriging). This kind of regression assumes all the data points are drawn from a common covariance function. This function is used to generate an (infinite) set of functions and only keeps the ones that pass through the observed data.\n#\n# ## Packages being used\n# + `pymc3`: has a Gaussian process regression function\n#\n# ## Relevant documentation\n# + `pymc3`: https://docs.pymc.io/notebooks/GP-MeansAndCovs.html, https://docs.pymc.io/notebooks/GP-Marginal.html\n\n#%%\n\nimport warnings\nimport numpy as np\nimport pymc3 as pm\nimport theano.tensor as tt\nfrom scipy import interpolate\nimport seaborn\nfrom matplotlib import pyplot as plt\nimport mpl_style\nget_ipython().run_line_magic('matplotlib', 'inline')\nplt.style.use(mpl_style.style1)\nseaborn.axes_style(mpl_style.style1)\n\n#%%\n\nwarnings.simplefilter('ignore')\n\n\n# ## The squared exponential covariance (or Radial-basis function or Exponential Quadratic)\n# As an example we will use the squared exponential covariance function:\n# $$ \\operatorname{Cov}{(x_1, x_2; h)} = \\exp{\\left( \\frac{-(x_1 - x_2)^2}{2h^2} \\right)} $$\n# Lets using this function to draw some _unconstrained_ functions:\n\n#%%\n\nh = 1\ncov = pm.gp.cov.ExpQuad(1, h)\n\nx = np.linspace(1, 10, 500)[:, None]\nK = cov(x).eval()\n\nplt.figure(1, figsize=(18, 8))\n\nplt.subplot(121)\nplt.plot(x, pm.MvNormal.dist(mu=np.zeros(K.shape[0]), cov=K).random(size=6).T)\nplt.xlabel('x')\nplt.ylabel('f(x)')\n\nplt.subplot(122)\nplt.imshow(K, interpolation='none', origin='upper', extent=[0, 10, 10, 0])\nplt.colorbar()\nplt.tight_layout()\n\n\n# ## Constrain the model\n#\n# Assume we have some data points, we can use Gaussian process regression to only pick the models that pass through those points:\n\n#%%\n\nx1 = np.array([1, 3, 5, 6, 7, 8])\ny1 = x1 * np.sin(x1)\n\n\n# ### Build the PYMC model\n# We will define priors for the length scale `h` and the leading scaling coefficient `c`. 
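# As a quick sanity check on these priors (my own aside, not part of the original notebook), a few samples can be drawn with the same `.dist().random()` pattern used for the `MvNormal` draws above:
#
# ```python
# import pymc3 as pm
#
# # plausible length scales under Gamma(alpha=2, beta=1)
# print(pm.Gamma.dist(alpha=2.0, beta=1.0).random(size=5))
#
# # plausible scale coefficients under HalfCauchy(beta=5)
# print(pm.HalfCauchy.dist(beta=5.0).random(size=5))
# ```
#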
We will assume there is a small level of equal but unknown noise associated with each data point.\n\n#%%\n\nX = x1[:, None]\nwith pm.Model() as model:\n h = pm.Gamma(\"h\", alpha=2, beta=1)\n c = pm.HalfCauchy(\"c\", beta=5)\n cov = c**2 * pm.gp.cov.ExpQuad(1, ls=h)\n gp = pm.gp.Marginal(cov_func=cov)\n noise = pm.HalfCauchy(\"noise\", beta=0.1)\n y_fit = gp.marginal_likelihood(\"y_{fit}\", X=X, y=y1, noise=noise)\n\ndisplay(model)\ndisplay(pm.model_to_graphviz(model))\n\n\n# Find the maximum of the likelihood using the `find_MAP` function.\n\n#%%\n\nwith model:\n mp = pm.find_MAP()\n\ndisplay(\n 'Best fit kernel: {0:.2f}**2 * ExpQuad(ls={1:.2f})'.format(mp['c'], mp['h']))\n\n\n# ### Use the fit to interpolate to new `X` values\n# This `MAP` fit can be used to interpolate and extrapolate to a new grid of points. PYMC offers the `predict` method to make this easier.\n\n#%%\n\nn_new = 500\nX_new = np.linspace(0, 10, n_new)\n\nmu, var = gp.predict(X_new[:, None], point=mp, diag=True)\nsd = np.sqrt(var)\n\n\n# Let's plot the result:\n\n#%%\n\nplt.figure(2, figsize=(10, 8))\nplt.plot(x1, y1, 'ok', label='observed')\n\nplt.plot(\n X_new,\n X_new * np.sin(X_new),\n '--',\n color='C1',\n label='True'\n)\n\nplt.plot(\n X_new.flatten(),\n mu,\n color='C0',\n lw=3,\n zorder=3,\n label='prediction'\n)\n\n# plot 95% best fit region\nplt.fill_between(\n X_new.flatten(),\n mu - 1.96*sd,\n mu + 1.96*sd,\n color='C0',\n alpha=0.3,\n zorder=1,\n label='95% confidence interval'\n)", "target_code": "plt.xlabel('x')\nplt.ylabel('f(x)')\nplt.ylim(-6, 12)\nplt.legend(loc='upper left', ncol=2)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Gaussian Process Regression\n#\n# At times you don't care about the underlying model for your data points and just want a model that describes the data. One such fitting technique is know as Gaussian process regression (also know as kriging). This kind of regression assumes all the data points are drawn from a common covariance function. 
This function is used to generate an (infinite) set of functions and only keeps the ones that pass through the observed data.\n#\n# ## Packages being used\n# + `pymc3`: has a Gaussian process regression function\n#\n# ## Relevant documentation\n# + `pymc3`: https://docs.pymc.io/notebooks/GP-MeansAndCovs.html, https://docs.pymc.io/notebooks/GP-Marginal.html\n\n\nimport warnings\nimport numpy as np\nimport pymc3 as pm\nimport theano.tensor as tt\nfrom scipy import interpolate\nimport seaborn\nfrom matplotlib import pyplot as plt\nimport mpl_style\nget_ipython().run_line_magic('matplotlib', 'inline')\nplt.style.use(mpl_style.style1)\nseaborn.axes_style(mpl_style.style1)\n\n\nwarnings.simplefilter('ignore')\n\n\n# ## The squared exponential covariance (or Radial-basis function or Exponential Quadratic)\n# As an example we will use the squared exponential covariance function:\n# $$ \\operatorname{Cov}{(x_1, x_2; h)} = \\exp{\\left( \\frac{-(x_1 - x_2)^2}{2h^2} \\right)} $$\n# Lets using this function to draw some _unconstrained_ functions:\n\n\nh = 1\ncov = pm.gp.cov.ExpQuad(1, h)\n\nx = np.linspace(1, 10, 500)[:, None]\nK = cov(x).eval()\n\nplt.figure(1, figsize=(18, 8))\n\nplt.subplot(121)\nplt.plot(x, pm.MvNormal.dist(mu=np.zeros(K.shape[0]), cov=K).random(size=6).T)\nplt.xlabel('x')\nplt.ylabel('f(x)')\n\nplt.subplot(122)\nplt.imshow(K, interpolation='none', origin='upper', extent=[0, 10, 10, 0])\nplt.colorbar()\nplt.tight_layout()\n\n\n# ## Constrain the model\n#\n# Assume we have some data points, we can use Gaussian process regression to only pick the models that pass through those points:\n\n\nx1 = np.array([1, 3, 5, 6, 7, 8])\ny1 = x1 * np.sin(x1)\n\n\n# ### Build the PYMC model\n# We will define priors for the length scale `h` and the leading scaling coefficient `c`. We will assume there is a small level of equal but unknown noise associated with each data point.\n\n\nX = x1[:, None]\nwith pm.Model() as model:\n h = pm.Gamma(\"h\", alpha=2, beta=1)\n c = pm.HalfCauchy(\"c\", beta=5)\n cov = c**2 * pm.gp.cov.ExpQuad(1, ls=h)\n gp = pm.gp.Marginal(cov_func=cov)\n noise = pm.HalfCauchy(\"noise\", beta=0.1)\n y_fit = gp.marginal_likelihood(\"y_{fit}\", X=X, y=y1, noise=noise)\n\ndisplay(model)\ndisplay(pm.model_to_graphviz(model))\n\n\n# Find the maximum of the likelihood using the `find_MAP` function.\n\n\nwith model:\n mp = pm.find_MAP()\n\ndisplay(\n 'Best fit kernel: {0:.2f}**2 * ExpQuad(ls={1:.2f})'.format(mp['c'], mp['h']))\n\n\n# ### Use the fit to interpolate to new `X` values\n# This `MAP` fit can be used to interpolate and extrapolate to a new grid of points. 
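# For intuition (my own cross-check, not from the original notebook), the posterior mean can also be computed directly with NumPy from the MAP values `mp['h']`, `mp['c']` and `mp['noise']` found above, assuming `noise` is a standard deviation so that its square sits on the diagonal of the training covariance:
#
# ```python
# import numpy as np
#
#
# def sqexp(a, b, h, c):
#     """Squared-exponential kernel c**2 * exp(-(a - b)**2 / (2 * h**2))."""
#     return c**2 * np.exp(-0.5 * ((a[:, None] - b[None, :]) / h) ** 2)
#
#
# x_grid = np.linspace(0, 10, 200)
# K = sqexp(x1, x1, mp['h'], mp['c']) + mp['noise'] ** 2 * np.eye(len(x1))
# K_star = sqexp(x_grid, x1, mp['h'], mp['c'])
#
# mu_manual = K_star @ np.linalg.solve(K, y1)  # should track the mean from gp.predict()
# ```
#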
PYMC offers the `predict` method to make this easier.\n\n\nn_new = 500\nX_new = np.linspace(0, 10, n_new)\n\nmu, var = gp.predict(X_new[:, None], point=mp, diag=True)\nsd = np.sqrt(var)\n\n\n# Let's plot the result:\n\n\nplt.figure(2, figsize=(10, 8))\nplt.plot(x1, y1, 'ok', label='observed')\n\nplt.plot(\n X_new,\n X_new * np.sin(X_new),\n '--',\n color='C1',\n label='True'\n)\n\nplt.plot(\n X_new.flatten(),\n mu,\n color='C0',\n lw=3,\n zorder=3,\n label='prediction'\n)\n\n# plot 95% best fit region\nplt.fill_between(\n X_new.flatten(),\n mu - 1.96*sd,\n mu + 1.96*sd,\n color='C0',\n alpha=0.3,\n zorder=1,\n label='95% confidence interval'\n)\n", "project_metadata": {"full_name": "CKrawczyk/jupyter_data_languages", "description": null, "topics": [], "git_url": "git://github.com/CKrawczyk/jupyter_data_languages.git", "stars": 8, "watchers": 8, "forks": 6, "created": "2016-09-30T15:06:10Z", "size": 25730, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2035356, "Python": 1225}, "last_updated": "2020-10-30T11:39:15Z"}, "intent": "# labels and legend"}, {"original_comment": "# use fsolve to find the roots of the equation\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Differential Equations\n\n# ## Important libraries\n\n#%%\n\nfrom scipy.integrate import odeint\nfrom scipy.optimize import fsolve\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Read and use LaTeX command\nplt.rc('text', usetex=True)\n\n# font's type\nplt.rc('font', family='serif')\n\n# enable the inline backend for usage with the IPython Notebook\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Style Sheets from Matplotlib\n\n# [_here_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n#\n# ```Python\n# plt.style.use('ggplot') #a style to use in everyplot\n#\n# with plt.style.context('ggplot'): #to use in a specific plot\n# plt.plot(...\n# ```\n\n# ## Direction Field + Solution\n\n# **Differential Equation:**\n#\n# $$x^{2}\\frac{dy}{dx} - 2xy = 3y^{4}$$\n#\n# **Solution:**\n#\n# $$y^{-3} = -\\frac{9}{5}x^{-1} + \\frac{49}{5}x^{-6}$$\n# **or**\n# $$y(x) = \\frac{1}{\\sqrt[3]{\\dfrac{9.8-\\dfrac{9x^{5}}{5}}{x^{6}}}}$$\n#\n\n#%%\n\n# ---------------------- DIRECTION FIELD -------------------------\n# domain points for the vector field (direction field)\nX_dirField = np.linspace(0.005, 2, 15)\nY_dirField = np.linspace(0.005, 4, 15)\n\n# position in the 2D plane\nX_dirField, Y_dirField = np.meshgrid(X_dirField, Y_dirField)\n\n# direction of our vectors\nU = 1\nV = (3*(Y_dirField)**4 + 2*X_dirField*Y_dirField)/(X_dirField**2)\n\n# magnitude\nMagVec = np.sqrt(U**2 + V**2)\n\n# normalize them\nU = U/MagVec\nV = V/MagVec\n\n# --------------- SOLUTION CURVE -----------------------------------\n\n\ndef sol(X):\n return 1/(((9.8 - 1.8*(X**5))/(X**6))**(1/3))\n\n\n# solution points\nX_sol = np.linspace(0.05, 1.4, 100)\n# ------------------ PLOT TIME -------------------------------\n\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n\n# solution curve\nplt.plot(X_sol, sol(X_sol),\n color='#E78708',\n label=r'$y^{-3} = -\\frac{9}{5} x^{-1} + \\frac{49}{5}x^{-6}$')\n\n# direction field\nplt.quiver(X_dirField, Y_dirField,\n U, V,\n width=0.0018,\n headwidth=4,\n headlength=5,\n label='Campo Direccional')\n\n# Add title and label to the axes\nplt.title('Direction field $+$ solution curve', size=25, pad=15)\nplt.xlabel('$x$', size=20, 
labelpad=10)\nplt.ylabel('$y(x)$', size=20, labelpad=10)\nplt.xlim(0, 2)\nplt.ylim(-0.1, 4)\n\n\n# add legend\nlegend = plt.legend(loc=0, prop={'size': 15})\nlegend.get_frame().set_color('#FFFFFF')\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Family of Curves\n\n# **Differential Equation**\n# $$x\\frac{dx}{dy} = \\sqrt{x^{2} + y^{2}}+y$$\n#\n# **General Solution**\n# $$y(x) = x\\sin\\big( \\ln(x) + C_{1} \\big)$$\n\n#%%\n\n# define our solution\ndef sol(x, c):\n return x*np.sin(c + np.log(x))\n\n\n# define the domaing\nX = np.linspace(0.001, 20, 150)\n\n# c1 values\nc1 = [-10, -5, 0, 5, 10]\ncolorplot = ['#182825', '#016FB9', '#22AED1', '#D81159', '#FBB13C']\n\n\n# label our curves\nnames = [r\"$C_{1} = -10$\", r\"$C_{1} = -5$\",\n r\"$C_{1} = 0$\", r\"$C_{1} = 5$\", r\"$C_{1} = 10$\"]\n\n# use and specific style sheet for this plot\nwith plt.style.context('seaborn-paper'):\n # create a figure\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n # use a for loop to plot all the curves\n for i in range(len(c1)):\n plt.plot(X, sol(X, c1[i]),\n color=colorplot[i],\n label=names[i]\n )\n\n # Add title and label to the axes\n plt.title(r'Solution curves for $x \\ \\frac{dx}{dy} = \\sqrt{x^{2} + y^{2}}+y$',\n size=25, pad=15)\n plt.xlabel('$x$', size=20, labelpad=10)\n plt.ylabel('$y(x)$', size=20, labelpad=10)\n\n # add legend\n legend = plt.legend(loc=0, prop={'size': 15})\n legend.get_frame().set_color('#FFFFFF')\n\n # increase the size of the ticks\n plt.tick_params(labelsize=15)\n\n plt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Orthogonal trajectories\n\n# **Family of curves**\n# $$y^{2}-x^{2} = c$$\n#\n# **Orthogonal trajectories**\n# $$y(x) = kx^{-1}$$\n#\n# **where $k$ and $c$ are integer parameters**\n\n#%%\n\n# --------------------------- FAMILY OF CURVES ---------------------\n# define our solution\ndef family(x, c):\n return np.sqrt(c + x**2)\n\n\n# define the domaing\nX = np.linspace(-10, 10, 150)\n\n# c1 values\nc_parameter = [10, 20, 30, 40, 50]\ncolorplot = ['#0290CC', '#022BCC', '#3E02CC', '#A302CC', '#CC0290']\n\n# create a figure\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n# use a for loop to plot all the curves\nfor i in range(len(c_parameter)):\n plt.plot(X, family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, -1*family(X, c_parameter[i]),\n color=colorplot[i],\n )\n\n\n# Add title and label to the axes\nplt.title(r'Family of curves',\n size=25, pad=15)\nplt.xlabel('$x$', size=20, labelpad=10)\nplt.ylabel('$y$', size=20, labelpad=10)\n\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n#%%\n\n# --------------------------- FAMILY OF CURVES ---------------------\n# define function for the family\ndef family(x, c):\n return np.sqrt(c + x**2)\n\n\n# define the domaing\nX = np.linspace(-10, 10, 150)\n\n# c values\nc_parameter = [10, 20, 30, 40, 50]\ncolorplot = ['#0290CC', '#022BCC', '#3E02CC', '#A302CC', '#CC0290']\n\n# ---------------------- ORTHOGONAL TRAJECTORIES --------------------\n# same values for k and c\nk_parameter = c_parameter\n\n# define function for the orthogonal trajectories\n\n\ndef orthogonal(x, k):\n return k*(1/x)\n\n\northogonal_colors = 
['#CC3E02', '#CCA302', '#90CC02', '#2BCC02', '#02CC3E']\n\n# ----------------- PLOT TIME ------------------------------------\n\n# create a figure\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n# use a for loop to plot all the curves\nfor i in range(len(c_parameter)):\n plt.plot(X, family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, -1*family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, orthogonal(X, k_parameter[i]),\n color=orthogonal_colors[i]\n )\n\n\n# Add title and label to the axes\nplt.title(r'Family of curves $+$ orthogonal trajectories',\n size=25, pad=15)\nplt.xlabel('$x$', size=20, labelpad=10)\nplt.ylabel('$y$', size=20, labelpad=10)\n\n# set limits\nplt.ylim(-10, 10)\n\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Logistic Equation\n\n# **Logistic Equation (with inmigration)**\n# $$\\frac{dP}{dt} = P\\left( 1 - P \\right) + 0.3 e^{-P}$$\n\n# We're using this function to fin the roots of a function\n# ```Python\n# from scipy.optimize import fsolve\n# ```\n# [Documentation of `fsolve`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fsolve.html)\n\n# ### Phase portrait\n\n#%%\n\n# background color\nbackground = '#342B47'\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)\n\n\n# Domain\nP = np.linspace(-6, 6, 1000)\n\n# use fsolve to find the roots of the equation\nraices_dPdt = fsolve(f, [-5, -1, 1])\n\nwith plt.style.context('dark_background'):\n # creamos nuestra figura\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=background\n )\n ax = plt.axes()\n ax.set_facecolor(background)\n\n # plot g(P)\n plt.plot(P, f(P),\n color='#6BCBC3',\n linewidth=2,\n #label = 'Gr\u00e1fica 1'\n )\n # plot the equilibrium points\n plt.scatter(raices_dPdt, [0]*3,\n c='#ECE0DC',\n label='Equilibrium points (constant solutions)',\n s=50,\n alpha=1,\n )\n\n # color for the arrows\n color_arrows = '#5DDA52'\n\n # arrow pointing to P1\n plt.quiver([-5.5], [0], [2], [0],\n scale=20,\n color=color_arrows\n )\n # arrow pointing to P1\n plt.quiver([-3.1], [0], [-2], [0],\n scale=20,\n color=color_arrows\n )\n # arrow near to P2 but pointing away from it\n plt.quiver([-0.62], [0], [-2], [0],\n scale=20, color=color_arrows\n )\n # arrow near to P2 but pointing away from it but pointing to P3\n plt.quiver([0.025], [0], [2], [0],\n scale=20, color=color_arrows\n )\n # arrow pointing to P3\n plt.quiver([2.15], [0], [-2], [0],\n scale=20, color=color_arrows\n )\n\n # title and axes labels\n plt.title(r'Phase portrait of $\\frac{\\mathrm{d}P}{\\mathrm{d}t}= P \\left( 1 - P \\right) + 0.3 e^{-P} $',\n pad=10,\n size=25\n )\n plt.xlabel(r'$P$', size=20)\n plt.ylabel(r'$g(P)$', size=20)\n plt.tick_params(labelsize=15)\n\n # legend\n plt.legend(loc=0, prop={'size': 18}).get_frame().set_facecolor(background)\n\n # Limit the axes\n plt.ylim(-7, 5)\n plt.xlim(-6, 3)\n\n # add grid\n plt.grid(b=True, linestyle='--', alpha=0.3)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Behaviour\n\n#%%\n\n# background color\nbackground = '#342B47'\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)\n\n\n# domain\nP = np.linspace(-6, 6, 1000)\n\n# use fsolve to find the roots of the 
equation\nraices_dPdt = fsolve(f, [-5, -1, 1])\n\nwith plt.style.context('dark_background'):\n\n # create figure and choose background\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=background\n )\n\n ax = plt.axes()\n ax.set_facecolor(background)\n\n # add the constant solutions\n for i in range(3):\n plt.plot([0, 10], [raices_dPdt[i]]*2, linestyle='--', color='white')\n\n # add text\n plt.text(4, -5.35, 'Increases', fontsize=20)\n plt.text(4, -2.45, 'Decrases', fontsize=20)\n plt.text(4, 0.25, 'Increases', fontsize=20)\n plt.text(4, 2.35, 'Decreases', fontsize=20)\n\n # title and axes lables\n plt.title(r'Behaviour of $P(t)$',\n pad=10,\n size=25\n )\n plt.xlabel('$t$ \\n[years]', size=20)\n plt.ylabel('$P(t)$ \\n[thousand of inhabitants]', size=20)\n plt.tick_params(labelsize=15)\n\n # Limit the axes\n plt.xlim(-0.1, 10.1)\n plt.ylim(-6, 4)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Logistic Function\n\n# **Solution**\n#\n# Using numerical aproximations with `odeint`\n#\n# ```Python\n# from scipy.integrate import odeint\n# ```\n#\n# [Documentation of `odeint`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.odeint.html)\n\n#%%\n\n# background color\nbackground = '#342B47'\n\n# import numerical solution to ODE\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)", "target_code": "raices_dPdt = fsolve(f, [-5, -1, 1])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Differential Equations\n\n# ## Important libraries\n\n\nfrom scipy.integrate import odeint\nfrom scipy.optimize import fsolve\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Read and use LaTeX command\nplt.rc('text', usetex=True)\n\n# font's type\nplt.rc('font', family='serif')\n\n# enable the inline backend for usage with the IPython Notebook\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Style Sheets from Matplotlib\n\n# [_here_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n#\n# ```Python\n# plt.style.use('ggplot') #a style to use in everyplot\n#\n# with plt.style.context('ggplot'): #to use in a specific plot\n# plt.plot(...\n# ```\n\n# ## Direction Field + Solution\n\n# **Differential Equation:**\n#\n# $$x^{2}\\frac{dy}{dx} - 2xy = 3y^{4}$$\n#\n# **Solution:**\n#\n# $$y^{-3} = -\\frac{9}{5}x^{-1} + \\frac{49}{5}x^{-6}$$\n# **or**\n# $$y(x) = \\frac{1}{\\sqrt[3]{\\dfrac{9.8-\\dfrac{9x^{5}}{5}}{x^{6}}}}$$\n#\n\n\n# ---------------------- DIRECTION FIELD -------------------------\n# domain points for the vector field (direction field)\nX_dirField = np.linspace(0.005, 2, 15)\nY_dirField = np.linspace(0.005, 4, 15)\n\n# position in the 2D plane\nX_dirField, Y_dirField = np.meshgrid(X_dirField, Y_dirField)\n\n# direction of our vectors\nU = 1\nV = (3*(Y_dirField)**4 + 2*X_dirField*Y_dirField)/(X_dirField**2)\n\n# magnitude\nMagVec = np.sqrt(U**2 + V**2)\n\n# normalize them\nU = U/MagVec\nV = V/MagVec\n\n# --------------- SOLUTION CURVE -----------------------------------\n\n\ndef sol(X):\n return 1/(((9.8 - 1.8*(X**5))/(X**6))**(1/3))\n\n\n# solution points\nX_sol = np.linspace(0.05, 1.4, 100)\n# ------------------ PLOT TIME -------------------------------\n\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n\n# solution curve\nplt.plot(X_sol, sol(X_sol),\n color='#E78708',\n label=r'$y^{-3} = -\\frac{9}{5} x^{-1} + 
\\frac{49}{5}x^{-6}$')\n\n# direction field\nplt.quiver(X_dirField, Y_dirField,\n U, V,\n width=0.0018,\n headwidth=4,\n headlength=5,\n label='Campo Direccional')\n\n# Add title and label to the axes\nplt.title('Direction field $+$ solution curve', size=25, pad=15)\nplt.xlabel('$x$', size=20, labelpad=10)\nplt.ylabel('$y(x)$', size=20, labelpad=10)\nplt.xlim(0, 2)\nplt.ylim(-0.1, 4)\n\n\n# add legend\nlegend = plt.legend(loc=0, prop={'size': 15})\nlegend.get_frame().set_color('#FFFFFF')\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Family of Curves\n\n# **Differential Equation**\n# $$x\\frac{dx}{dy} = \\sqrt{x^{2} + y^{2}}+y$$\n#\n# **General Solution**\n# $$y(x) = x\\sin\\big( \\ln(x) + C_{1} \\big)$$\n\n\n# define our solution\ndef sol(x, c):\n return x*np.sin(c + np.log(x))\n\n\n# define the domaing\nX = np.linspace(0.001, 20, 150)\n\n# c1 values\nc1 = [-10, -5, 0, 5, 10]\ncolorplot = ['#182825', '#016FB9', '#22AED1', '#D81159', '#FBB13C']\n\n\n# label our curves\nnames = [r\"$C_{1} = -10$\", r\"$C_{1} = -5$\",\n r\"$C_{1} = 0$\", r\"$C_{1} = 5$\", r\"$C_{1} = 10$\"]\n\n# use and specific style sheet for this plot\nwith plt.style.context('seaborn-paper'):\n # create a figure\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n # use a for loop to plot all the curves\n for i in range(len(c1)):\n plt.plot(X, sol(X, c1[i]),\n color=colorplot[i],\n label=names[i]\n )\n\n # Add title and label to the axes\n plt.title(r'Solution curves for $x \\ \\frac{dx}{dy} = \\sqrt{x^{2} + y^{2}}+y$',\n size=25, pad=15)\n plt.xlabel('$x$', size=20, labelpad=10)\n plt.ylabel('$y(x)$', size=20, labelpad=10)\n\n # add legend\n legend = plt.legend(loc=0, prop={'size': 15})\n legend.get_frame().set_color('#FFFFFF')\n\n # increase the size of the ticks\n plt.tick_params(labelsize=15)\n\n plt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Orthogonal trajectories\n\n# **Family of curves**\n# $$y^{2}-x^{2} = c$$\n#\n# **Orthogonal trajectories**\n# $$y(x) = kx^{-1}$$\n#\n# **where $k$ and $c$ are integer parameters**\n\n\n# --------------------------- FAMILY OF CURVES ---------------------\n# define our solution\ndef family(x, c):\n return np.sqrt(c + x**2)\n\n\n# define the domaing\nX = np.linspace(-10, 10, 150)\n\n# c1 values\nc_parameter = [10, 20, 30, 40, 50]\ncolorplot = ['#0290CC', '#022BCC', '#3E02CC', '#A302CC', '#CC0290']\n\n# create a figure\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n# use a for loop to plot all the curves\nfor i in range(len(c_parameter)):\n plt.plot(X, family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, -1*family(X, c_parameter[i]),\n color=colorplot[i],\n )\n\n\n# Add title and label to the axes\nplt.title(r'Family of curves',\n size=25, pad=15)\nplt.xlabel('$x$', size=20, labelpad=10)\nplt.ylabel('$y$', size=20, labelpad=10)\n\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# --------------------------- FAMILY OF CURVES ---------------------\n# define function for the family\ndef family(x, c):\n return np.sqrt(c + x**2)\n\n\n# define the domaing\nX = np.linspace(-10, 10, 150)\n\n# c values\nc_parameter = [10, 20, 30, 40, 50]\ncolorplot = ['#0290CC', '#022BCC', 
'#3E02CC', '#A302CC', '#CC0290']\n\n# ---------------------- ORTHOGONAL TRAJECTORIES --------------------\n# same values for k and c\nk_parameter = c_parameter\n\n# define function for the orthogonal trajectories\n\n\ndef orthogonal(x, k):\n return k*(1/x)\n\n\northogonal_colors = ['#CC3E02', '#CCA302', '#90CC02', '#2BCC02', '#02CC3E']\n\n# ----------------- PLOT TIME ------------------------------------\n\n# create a figure\nplt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n )\n# use a for loop to plot all the curves\nfor i in range(len(c_parameter)):\n plt.plot(X, family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, -1*family(X, c_parameter[i]),\n color=colorplot[i],\n )\n plt.plot(X, orthogonal(X, k_parameter[i]),\n color=orthogonal_colors[i]\n )\n\n\n# Add title and label to the axes\nplt.title(r'Family of curves $+$ orthogonal trajectories',\n size=25, pad=15)\nplt.xlabel('$x$', size=20, labelpad=10)\nplt.ylabel('$y$', size=20, labelpad=10)\n\n# set limits\nplt.ylim(-10, 10)\n\n\n# increase the size of the ticks\nplt.tick_params(labelsize=15)\n\nplt.grid(b=True, alpha=0.5, linestyle='dashed')\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Logistic Equation\n\n# **Logistic Equation (with inmigration)**\n# $$\\frac{dP}{dt} = P\\left( 1 - P \\right) + 0.3 e^{-P}$$\n\n# We're using this function to fin the roots of a function\n# ```Python\n# from scipy.optimize import fsolve\n# ```\n# [Documentation of `fsolve`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fsolve.html)\n\n# ### Phase portrait\n\n\n# background color\nbackground = '#342B47'\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)\n\n\n# Domain\nP = np.linspace(-6, 6, 1000)\n\n# use fsolve to find the roots of the equation\nraices_dPdt = fsolve(f, [-5, -1, 1])\n\nwith plt.style.context('dark_background'):\n # creamos nuestra figura\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=background\n )\n ax = plt.axes()\n ax.set_facecolor(background)\n\n # plot g(P)\n plt.plot(P, f(P),\n color='#6BCBC3',\n linewidth=2,\n #label = 'Gr\u00e1fica 1'\n )\n # plot the equilibrium points\n plt.scatter(raices_dPdt, [0]*3,\n c='#ECE0DC',\n label='Equilibrium points (constant solutions)',\n s=50,\n alpha=1,\n )\n\n # color for the arrows\n color_arrows = '#5DDA52'\n\n # arrow pointing to P1\n plt.quiver([-5.5], [0], [2], [0],\n scale=20,\n color=color_arrows\n )\n # arrow pointing to P1\n plt.quiver([-3.1], [0], [-2], [0],\n scale=20,\n color=color_arrows\n )\n # arrow near to P2 but pointing away from it\n plt.quiver([-0.62], [0], [-2], [0],\n scale=20, color=color_arrows\n )\n # arrow near to P2 but pointing away from it but pointing to P3\n plt.quiver([0.025], [0], [2], [0],\n scale=20, color=color_arrows\n )\n # arrow pointing to P3\n plt.quiver([2.15], [0], [-2], [0],\n scale=20, color=color_arrows\n )\n\n # title and axes labels\n plt.title(r'Phase portrait of $\\frac{\\mathrm{d}P}{\\mathrm{d}t}= P \\left( 1 - P \\right) + 0.3 e^{-P} $',\n pad=10,\n size=25\n )\n plt.xlabel(r'$P$', size=20)\n plt.ylabel(r'$g(P)$', size=20)\n plt.tick_params(labelsize=15)\n\n # legend\n plt.legend(loc=0, prop={'size': 18}).get_frame().set_facecolor(background)\n\n # Limit the axes\n plt.ylim(-7, 5)\n plt.xlim(-6, 3)\n\n # add grid\n plt.grid(b=True, linestyle='--', alpha=0.3)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### 
Behaviour\n\n\n# background color\nbackground = '#342B47'\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)\n\n\n# domain\nP = np.linspace(-6, 6, 1000)\n\n# use fsolve to find the roots of the equation\nraices_dPdt = fsolve(f, [-5, -1, 1])\n\nwith plt.style.context('dark_background'):\n\n # create figure and choose background\n plt.figure(figsize=(10.5, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=background\n )\n\n ax = plt.axes()\n ax.set_facecolor(background)\n\n # add the constant solutions\n for i in range(3):\n plt.plot([0, 10], [raices_dPdt[i]]*2, linestyle='--', color='white')\n\n # add text\n plt.text(4, -5.35, 'Increases', fontsize=20)\n plt.text(4, -2.45, 'Decrases', fontsize=20)\n plt.text(4, 0.25, 'Increases', fontsize=20)\n plt.text(4, 2.35, 'Decreases', fontsize=20)\n\n # title and axes lables\n plt.title(r'Behaviour of $P(t)$',\n pad=10,\n size=25\n )\n plt.xlabel('$t$ \\n[years]', size=20)\n plt.ylabel('$P(t)$ \\n[thousand of inhabitants]', size=20)\n plt.tick_params(labelsize=15)\n\n # Limit the axes\n plt.xlim(-0.1, 10.1)\n plt.ylim(-6, 4)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Logistic Function\n\n# **Solution**\n#\n# Using numerical aproximations with `odeint`\n#\n# ```Python\n# from scipy.integrate import odeint\n# ```\n#\n# [Documentation of `odeint`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.odeint.html)\n\n\n# background color\nbackground = '#342B47'\n\n# import numerical solution to ODE\n\n# import fsolve, to find roots in functions\n\n# dP/dt\n\n\ndef f(P, t=0):\n return P*(1-P) + 0.3*np.exp(-P)\n", "project_metadata": {"full_name": "isaacarroyov/ss_plots", "description": "Repositorio de gr\u00e1ficas realizadas en Python para mis boletines de servicio social (Ecuaciones Diferenciales y An\u00e1lisis Vectorial) || Repository of the plots made in Python for my social service bulletins (Differential Equations and Vector Calculus)", "topics": ["differential-equations", "math", "vector-analysis", "university", "python3", "python", "ecuaciones-diferenciales"], "git_url": "git://github.com/isaacarroyov/ss_plots.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-27T19:15:30Z", "size": 21849, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 29758848}, "last_updated": "2020-11-24T18:53:41Z"}, "intent": "# use fsolve to find the roots of the equation"}, {"original_comment": "# Fit the model, using the train sample\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER)\n# ## Basic Models\n\n# [1. Initialise](#1.-Initialise)\n# \n# [2. Generate Features](#2.-Generate-Features)\n# \n# [3. Read Data](#3.-Read-Data)\n# \n# [4. Filter Features](#4.-Filter-Features)\n# \n# [5. Set Samples & Target Features](#5.-Set-Samples-&-Target-Features)\n# \n# [6. Recategorise & Transform](#6.-Recategorise-&-Transform)\n# \n# [7. Rank & Select Features](#7.-Rank-&-Select-Features)\n# \n# [8. 
Model](#8.-Model)\n# \n\n# This Jupyter IPython Notebook applies the Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER).\n#\n# This Jupyter IPython Notebook extract aggregated features from the MySQL database, & then pre-process, configure & apply several modelling approaches.\n#\n# The pre-processing framework & modelling algorithms in this Notebook are developed as part of the Integrated Care project at the Health & Social Care Modelling Group (HSCMG), The University of Westminster.\n#\n# Note that some of the scripts are optional or subject to some pre-configurations. Please refer to the comments & the project documentations for further details.\n\n# \n# Copyright 2017 The Project Authors. All Rights Reserved.\n#\n# It is licensed under the Apache License, Version 2.0. you may not use this file except in compliance with the License. You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n# \n\n# ## 1. Initialise\n\n# Reload modules\n\n#%%\n\n# Reload modules\n# It is an optional step. It is useful to run when external Python modules are being modified\n# It is reloading all modules (except those excluded by %aimport) every time before executing the Python code typed.\n# Note: It may conflict with serialisation, when external modules are being modified\n\n# %load_ext autoreload\n# %autoreload 2\n\n\n# Import libraries\n\n#%%\n\n# Import Python libraries\nimport logging\nimport os\nimport sys\nimport gc\nimport pandas as pd\nfrom IPython.display import display, HTML\nfrom collections import OrderedDict\nimport numpy as np\nimport statistics\nfrom scipy.stats import stats\n\n#%%\n\n# Import local Python modules\nfrom Configs.CONSTANTS import CONSTANTS\nfrom Configs.Logger import Logger\nfrom Features.Variables import Variables\nfrom ReadersWriters.ReadersWriters import ReadersWriters\nfrom Stats.PreProcess import PreProcess\nfrom Stats.FeatureSelection import FeatureSelection\nfrom Stats.TrainingMethod import TrainingMethod\nfrom Stats.Plots import Plots\n\n#%%\n\n# Check the interpreter\nprint(\"\\nMake sure the correct Python interpreter is used!\")\nprint(sys.version)\nprint(\"\\nMake sure sys.path of the Python interpreter is correct!\")\nprint(os.getcwd())\n\n\n#

\n\n# ### 1.1. Initialise General Settings\n\n# Main configuration Settings: \n# - Specify the full path of the configuration file\n#
→ config_path\n# - Specify the full path of the output folder\n#
→ io_path\n# - Specify the application name (the suffix of the outputs file name)\n#
→ app_name\n# - Specify the sub-model name, to locate the related feature configuration, based on the \"Table_Reference_Name\" column in the configuration file\n#
→ submodel_name\n# - Specify the file name of the sub-model's input (excluding the CSV extension)\n#
→ submodel_input_name\n#
\n#
\n#\n# External Configuration Files: \n# - The MySQL database configuration settings & other configuration metadata\n#
Inputs/CONFIGURATIONS_1.ini\n# - The input features' configuration file (Note: only the CSV export of the XLSX will be used by this Notebook)\n#
Inputs/config_features_path.xlsx\n#
Inputs/config_features_path.csv\n\n#%%\n\nconfig_path = os.path.abspath(\"ConfigInputs/CONFIGURATIONS.ini\")\nio_path = os.path.abspath(\"../../tmp/TCARER/Basic_prototype\")\napp_name = \"T-CARER\"\nsubmodel_name = \"hesIp\"\nsubmodel_input_name = \"tcarer_model_features_ip\"\n\nprint(\"\\n The full path of the configuration file: \\n\\t\", config_path,\n \"\\n The full path of the output folder: \\n\\t\", io_path,\n \"\\n The application name (the suffix of the outputs file name): \\n\\t\", app_name,\n \"\\n The sub-model name, to locate the related feature configuration: \\n\\t\", submodel_name,\n \"\\n The the sub-model's the file name of the input: \\n\\t\", submodel_input_name)\n\n\n#
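\n\n# The database credentials and related options live in the CONFIGURATIONS.ini file referenced above and are consumed by the project's own classes. Purely as a quick sanity check, the file can also be inspected with Python's standard configparser -- a minimal sketch that assumes nothing about the section or option names:\n\n#%%\n\nimport configparser\n\nparser = configparser.ConfigParser()\nparser.read(config_path)  # config_path was set in the cell above\n\n# list the sections and option names actually present in the .ini file\nfor section in parser.sections():\n    print(section, '->', list(parser[section].keys()))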

\n\n# Initialise logs\n\n#%%\n\nif not os.path.exists(io_path):\n os.makedirs(io_path, exist_ok=True)\n\nlogger = Logger(path=io_path, app_name=app_name, ext=\"log\")\nlogger = logging.getLogger(app_name)\n\n\n# Initialise constants and some of classes\n\n#%%\n\n# Initialise constants\nCONSTANTS.set(io_path, app_name)\n\n#%%\n\n# Initialise other classes\nreaders_writers = ReadersWriters()\npreprocess = PreProcess(io_path)\nfeature_selection = FeatureSelection()\nplts = Plots()\n\n#%%\n\n# Set print settings\npd.set_option('display.width', 1600, 'display.max_colwidth', 800)\n\n\n# ### 1.2. Initialise Features Metadata\n\n# Read the input features' confugration file & store the features metadata\n\n#%%\n\n# variables settings\nfeatures_metadata = dict()\n\nfeatures_metadata_all = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=CONSTANTS.config_features_path, dataframing=True)\nfeatures_metadata = features_metadata_all.loc[(features_metadata_all[\"Selected\"] == 1) &\n (features_metadata_all[\"Table_Reference_Name\"] == submodel_name)]\nfeatures_metadata.reset_index()\n\n# print\ndisplay(features_metadata)\n\n\n# Set input features' metadata dictionaries\n\n#%%\n\n# Dictionary of features types, dtypes, & max-states\nfeatures_types = dict()\nfeatures_dtypes = dict()\nfeatures_states_values = dict()\nfeatures_names_group = dict()\n\nfor _, row in features_metadata.iterrows():\n if not pd.isnull(row[\"Variable_Max_States\"]):\n states_values = str(row[\"Variable_Max_States\"]).split(',')\n states_values = list(map(int, states_values))\n else:\n states_values = None\n\n if not pd.isnull(row[\"Variable_Aggregation\"]):\n postfixes = row[\"Variable_Aggregation\"].replace(' ', '').split(',')\n f_types = row[\"Variable_Type\"].replace(' ', '').split(',')\n f_dtypes = row[\"Variable_dType\"].replace(' ', '').split(',')\n for p in range(len(postfixes)):\n features_types[row[\"Variable_Name\"] +\n \"_\" + postfixes[p]] = f_types[p]\n features_dtypes[row[\"Variable_Name\"] + \"_\" +\n postfixes[p]] = pd.Series(dtype=f_dtypes[p])\n features_states_values[row[\"Variable_Name\"] +\n \"_\" + postfixes[p]] = states_values\n features_names_group[row[\"Variable_Name\"] + \"_\" +\n postfixes[p]] = row[\"Variable_Name\"] + \"_\" + postfixes[p]\n else:\n features_types[row[\"Variable_Name\"]] = row[\"Variable_Type\"]\n features_dtypes[row[\"Variable_Name\"]] = row[\"Variable_dType\"]\n features_states_values[row[\"Variable_Name\"]] = states_values\n features_names_group[row[\"Variable_Name\"]] = row[\"Variable_Name\"]\n if states_values is not None:\n for postfix in states_values:\n features_names_group[row[\"Variable_Name\"] +\n \"_\" + str(postfix)] = row[\"Variable_Name\"]\n\nfeatures_dtypes = pd.DataFrame(features_dtypes).dtypes\n\n#%%\n\n# Dictionary of features groups\nfeatures_types_group = OrderedDict()\n\nf_types = set([f_type for f_type in features_types.values()])\nfeatures_types_group = OrderedDict(\n zip(list(f_types), [set() for _ in range(len(f_types))]))\nfor f_name, f_type in features_types.items():\n features_types_group[f_type].add(f_name)\n\nprint(\"Available features types: \" + ','.join(f_types))\n\n\n#
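\n\n# To make the aggregation expansion above concrete, here is a tiny self-contained sketch with a hypothetical metadata row (the column names mirror the configuration file, the values are invented): a variable 'age' with aggregations 'min, max' is expanded into 'age_min' and 'age_max' entries.\n\n#%%\n\n# hypothetical single row of config_features_path.csv, for illustration only\ntoy_row = {'Variable_Name': 'age', 'Variable_Aggregation': 'min, max',\n           'Variable_Type': 'CONTINUOUS, CONTINUOUS', 'Variable_dType': 'f, f'}\n\ntoy_types = dict()\npostfixes = toy_row['Variable_Aggregation'].replace(' ', '').split(',')\nf_types = toy_row['Variable_Type'].replace(' ', '').split(',')\nfor p in range(len(postfixes)):\n    toy_types[toy_row['Variable_Name'] + '_' + postfixes[p]] = f_types[p]\n\nprint(toy_types)  # {'age_min': 'CONTINUOUS', 'age_max': 'CONTINUOUS'}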

\n\n# ## 2. Generate Features\n\n# Notes:\n# - It generates the final spell-wise & temporal features from the MySQL table(s), & converts it into CSV(s);\n# - It generates the CSV(s) based on the configuration file of the features (Note: only the CSV export of the XLSX will be used by this Notebook)\n#
Inputs/config_features_path.xlsx\n#
Inputs/config_features_path.csv\n\n#%%\n\nskip = True\n\n# settings\ncsv_schema = [\"my_db_schema\"]\ncsv_input_tables = [\"tcarer_features\"]\ncsv_history_tables = [\"hesIp\"]\ncsv_column_index = \"localID\"\ncsv_output_table = \"tcarer_model_features_ip\"\ncsv_query_batch_size = 100000\n\n#%%\n\nif skip is False:\n # generate the csv file\n variables = Variables(submodel_name,\n CONSTANTS.io_path,\n CONSTANTS.io_path,\n CONSTANTS.config_features_path,\n csv_output_table)\n variables.set(csv_schema, csv_input_tables, csv_history_tables,\n csv_column_index, csv_query_batch_size)\n\n\n#
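\n\n# The Variables class drives the actual extraction and its internals are not shown in this notebook. Purely to illustrate the idea behind csv_query_batch_size, a generic batched export from MySQL to CSV with pandas could look like the sketch below -- the connection URL and query are placeholders, not the project's real settings.\n\n#%%\n\n# illustrative only -- not the project's ReadersWriters/Variables implementation\nimport pandas as pd\nfrom sqlalchemy import create_engine\n\n\ndef export_in_batches(connection_url, query, out_csv, batch_size=100000):\n    engine = create_engine(connection_url)\n    first = True\n    # chunksize makes read_sql return an iterator of DataFrames\n    for chunk in pd.read_sql(query, engine, chunksize=batch_size):\n        chunk.to_csv(out_csv, mode='w' if first else 'a', header=first, index=False)\n        first = False\n\n\n# export_in_batches('mysql+pymysql://user:password@host/my_db_schema',\n#                   'SELECT * FROM tcarer_features', 'tcarer_model_features_ip.csv')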

\n\n# ## 3. Read Data\n\n# Read the input features from the CSV input file\n\n#%%\n\nfeatures_input = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=submodel_input_name, dataframing=True)\nfeatures_input.astype(dtype=features_dtypes)\n\nprint(\"Number of columns: \", len(features_input.columns),\n \"; Total records: \", len(features_input.index))\n\n\n# Verify features visually\n\n#%%\n\ndisplay(features_input.head())\n\n\n#
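\n\n# A small reminder on the dtype coercion used above: pandas' DataFrame.astype returns a new frame rather than modifying the original in place, so its result has to be assigned back if the coerced dtypes are meant to persist. A minimal, self-contained sketch:\n\n#%%\n\nimport pandas as pd\n\ntoy = pd.DataFrame({'a': ['1', '2'], 'b': ['0.5', '1.5']})\ntoy = toy.astype({'a': 'int64', 'b': 'float64'})  # assign the result back\nprint(toy.dtypes)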

\n\n# ## 4. Filter Features\n\n# ### 4.1. Descriptive Statsistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n#%%\n\nfile_name = \"Step_04_Data_ColumnNames\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=list(\n features_input.columns.values), append=False)\nfile_name = \"Step_04_Stats_Categorical\"\no_stats = preprocess.stats_discrete_df(df=features_input, includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_04_Stats_Continuous\"\no_stats = preprocess.stats_continuous_df(df=features_input, includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\nfile_name = \"Step_04_Stats_Target\"\no_stats = preprocess.stats_discrete_df(df=features_input, includes=features_types_group[\"TARGET\"],\n file_name=file_name)\n\n\n# ### 4.2. Selected Population\n\n# #### 4.2.1. Remove Excluded Population, Remove Unused Features\n\n# Nothing to do!\n#
\n# Notes: \n# - Ideally the features must be configured before generating the CSV feature file, as it is very inefficient to derive new features at this stage\n# - This step is not necessary if all the features are generated prior to the generation of the CSV feature file\n\n#%%\n\n# Exclusion of unused features\n# excluded = [name for name in features_input.columns if name not in features_names_group.keys()]\n# features_input = features_input.drop(excluded, axis=1)\n\n# print(\"Number of columns: \", len(features_input.columns), \"; Total records: \", len(features_input.index))\n\n\n#

\n\n# ## 5. Set Samples & Target Features\n\n# ### 5.1. Set Features\n\n# #### 5.1.1. Train & Test Samples\n\n# Set the samples\n\n#%%\n\nfrac_train = 0.50\nreplace = False\nrandom_state = 100\n\nnrows = len(features_input.index)\nfeatures = {\"train\": dict(), \"test\": dict()}\nfeatures[\"train\"] = features_input.sample(\n frac=frac_train, replace=False, random_state=100)\nfeatures[\"test\"] = features_input.drop(features[\"train\"].index)\n\nfeatures[\"train\"] = features[\"train\"].reset_index(drop=True)\nfeatures[\"test\"] = features[\"test\"].reset_index(drop=True)\n\n\n# Verify features visually\n\n#%%\n\ndisplay(features_input.head())\n\n\n# Clean-Up\n\n#%%\n\nfeatures_input = None\ngc.collect()\n\n\n# #### 5.1.2. Independent & Target variable\u00b6\n\n# Set independent, target & ID features\n\n#%%\n\ntarget_labels = list(features_types_group[\"TARGET\"])\ntarget_id = [\"patientID\"]\n\n#%%\n\nfeatures[\"train_indep\"] = dict()\nfeatures[\"train_target\"] = dict()\nfeatures[\"train_id\"] = dict()\nfeatures[\"test_indep\"] = dict()\nfeatures[\"test_target\"] = dict()\nfeatures[\"test_id\"] = dict()\n\n# Independent and target features\n\n\ndef set_features_indep_target(df):\n df_targets = pd.DataFrame(\n dict(zip(target_labels, [[]] * len(target_labels))))\n for i in range(len(target_labels)):\n df_targets[target_labels[i]] = df[target_labels[i]]\n\n df_indep = df.drop(target_labels + target_id, axis=1)\n df_id = pd.DataFrame({target_id[0]: df[target_id[0]]})\n\n return df_indep, df_targets, df_id\n\n#%%\n\n# train & test sets\nfeatures[\"train_indep\"], features[\"train_target\"], features[\"train_id\"] = set_features_indep_target(\n features[\"train\"])\nfeatures[\"test_indep\"], features[\"test_target\"], features[\"test_id\"] = set_features_indep_target(\n features[\"test\"])\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# Verify features visually\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# Clean-Up\n\n#%%\n\ndel features[\"train\"]\ndel features[\"test\"]\ngc.collect()\n\n\n# ### 5.5. Save Samples\n\n# Serialise & save the samples before any feature transformation.\n#
This snapshot of the samples may be used for population profiling\n\n#%%\n\nfile_name = \"Step_05_Features\"\nreaders_writers.save_serialised_compressed(\n    path=CONSTANTS.io_path, title=file_name, objects=features)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns),\n      \"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(features[\"test_indep\"]), \"}\")\n\n\n# ### 5.2. Remove - Near Zero Variance\n# In order to reduce sparseness and invalid features, highly stationary ones were withdrawn. The features whose constant counts were less than or equal to a threshold were filtered out, to exclude highly constant and near-zero-variance features.\n#\n# The near-zero-variance rules, sketched in plain pandas after Section 5.4 below, are:\n# - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be greater than a threshold;\n# - Percent of unique values: The number of unique values divided by the total number of samples to be greater than the threshold\n#\n# Configure: the function\n# - The cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100\n#
→ thresh_unique_cut\n# - The cutoff for the ratio of the most common value to the second most common value (lower limit). e.g. 95/5\n#
→ thresh_freq_cut\n\n#%%\n\nthresh_unique_cut = 100\nthresh_freq_cut = 1000\n\nexcludes = []\nfile_name = \"Step_05_Preprocess_NZV_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=True)\n\nfile_name = \"Step_05_Preprocess_NZV\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_05_Preprocess_NZV_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 5.3. Remove Highly Linearly Correlated\n#\n# In this step, features that were highly linearly correlated were excluded.\n#\n# Configure: the function\n# - A numeric value for the pair-wise absolute correlation cutoff. e.g. 0.95\n#
→ thresh_corr_cut\n\n#%%\n\nthresh_corr_cut = 0.95\n\nexcludes = list(features_types_group[\"CATEGORICAL\"])\nfile_name = \"Step_05_Preprocess_Corr_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=True)\n\nfile_name = \"Step_05_Preprocess_Corr\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_05_Preprocess_Corr_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 5.4. Descriptive Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n#%%\n\n# columns\nfile_name = \"Step_05_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_05_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_05_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_05_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_05_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n#
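\n\n# The near-zero-variance and correlation filters above are applied through the project's PreProcess helpers, whose source is not part of this notebook. For reference, a self-contained plain-pandas sketch of the same two rules (frequency ratio and percent of unique values for near-zero variance, and a pairwise absolute-correlation cutoff) is given below; the thresholds are the illustrative values used in this notebook and this is not the project's implementation.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\n\ndef near_zero_variance_columns(df, thresh_freq_cut=1000, thresh_unique_cut=100):\n    # flag columns whose top value dominates and which have very few distinct values\n    drop = []\n    for col in df.columns:\n        counts = df[col].value_counts()\n        freq_ratio = counts.iloc[0] / counts.iloc[1] if len(counts) > 1 else np.inf\n        pct_unique = 100.0 * df[col].nunique() / len(df)\n        if freq_ratio > thresh_freq_cut and pct_unique < thresh_unique_cut:\n            drop.append(col)\n    return drop\n\n\ndef highly_correlated_columns(df, thresh_corr_cut=0.95):\n    # keep only the upper triangle so each pair is inspected once\n    corr = df.corr().abs()\n    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))\n    return [col for col in upper.columns if (upper[col] > thresh_corr_cut).any()]\n\n\n# usage sketch:\n# to_drop = near_zero_variance_columns(features['train_indep']) + highly_correlated_columns(features['train_indep'])\n# features['train_indep'] = features['train_indep'].drop(columns=to_drop)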

\n\n# ## 6. Recategorise & Transform\n\n# Verify features visually\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# ### 6.1. Recategorise\n\n# Define the factorisation function to generate dummy features for the categorical features.\n\n#%%\n\ndef factorise_settings(max_categories_frac, min_categories_num, exclude_zero):\n categories_dic = dict()\n labels_dic = dict()\n dtypes_dic = dict()\n dummies = []\n\n for f_name in features_types_group[\"CATEGORICAL\"]:\n if f_name in features[\"train_indep\"]:\n # find top & valid states\n summaries = stats.itemfreq(features[\"train_indep\"][f_name])\n summaries = pd.DataFrame(\n {\"value\": summaries[:, 0], \"freq\": summaries[:, 1]})\n summaries[\"value\"] = list(map(int, summaries[\"value\"]))\n summaries = summaries.sort_values(\"freq\", ascending=False)\n summaries = list(summaries[\"value\"])\n\n # exclude zero state\n if exclude_zero is True and len(summaries) > 1:\n summaries = [s for s in summaries if s != 0]\n\n # if included in the states\n summaries = [v for v in summaries if v in set(\n features_states_values[f_name])]\n\n # limit number of states\n max_cnt = max(\n int(len(summaries) * max_categories_frac), min_categories_num)\n\n # set states\n categories_dic[f_name] = summaries[0:max_cnt]\n labels_dic[f_name] = [f_name + \"_\" +\n str(c) for c in categories_dic[f_name]]\n dtypes_dic = {**dtypes_dic,\n **dict(zip(labels_dic[f_name], [pd.Series(dtype='i') for _ in range(len(categories_dic[f_name]))]))}\n dummies += labels_dic[f_name]\n\n dtypes_dic = pd.DataFrame(dtypes_dic).dtypes\n\n # print\n print(\"Total Categorical Variables : \", len(categories_dic.keys()),\n \"; Total Number of Dummy Variables: \", sum([len(categories_dic[f_name]) for f_name in categories_dic.keys()]))\n return categories_dic, labels_dic, dtypes_dic, features_types\n\n\n# Select categories: by order of freq., max_categories_frac, & max_categories_num\n#\n#
Configure: The input arguments are:\n# - Specify the maximum fraction of a feature's categories (ordered by frequency) to keep\n#
→ max_categories_frac\n# - Specify the minimum number of categories a feature can have\n#
→ min_categories_num\n# - Specify whether to exclude the state '0' (zero). State zero in our features represents 'any other state', including NULL\n#
→ exclude_zero = False\n\n#%%\n\nmax_categories_frac = 0.90\nmin_categories_num = 1\nexclude_zero = False # if possible remove state zero\n\ncategories_dic, labels_dic, dtypes_dic, features_types_group[\"DUMMIES\"] = factorise_settings(\n max_categories_frac, min_categories_num, exclude_zero)\n\n\n# Manually add dummy variables to the dataframe & remove the original Categorical variables\n\n#%%\n\nfeatures[\"train_indep_temp\"] = preprocess.factoring_feature_wise(\n features[\"train_indep\"], categories_dic, labels_dic, dtypes_dic, threaded=False)\nfeatures[\"test_indep_temp\"] = preprocess.factoring_feature_wise(\n features[\"test_indep\"], categories_dic, labels_dic, dtypes_dic, threaded=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# Verify features visually\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(), features[\"train_target\"].head(\n), features[\"train_indep_temp\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(), features[\"test_target\"].head(\n), features[\"test_indep_temp\"].head()], axis=1))\n\n\n# Set\n\n#%%\n\nfeatures[\"train_indep\"] = features[\"train_indep_temp\"].copy(True)\nfeatures[\"test_indep\"] = features[\"test_indep_temp\"].copy(True)\n\n\n# Clean-Up\n\n#%%\n\ndel features[\"train_indep_temp\"]\ndel features[\"test_indep_temp\"]\ngc.collect()\n\n\n# ### 6.2. Remove - Near Zero Variance\n\n# Optional: Remove more features with near zero variance, after the factorisation step.\n# Configure: the function\n\n#%%\n\n# the cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100\nthresh_unique_cut = 100\n# the cutoff for the ratio of the most common value to the second most common value (lower limit). eg. 95/5\nthresh_freq_cut = 1000\n\nexcludes = []\nfile_name = \"Step_06_Preprocess_NZV_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=True)\n\nfile_name = \"Step_06_Preprocess_NZV\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_06_Preprocess_NZV_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 6.3. Remove Highly Linearly Correlated\n\n# Optional: Remove more features with highly linearly correlated, after the factorisation step.\n# Configure: the function\n\n#%%\n\n# A numeric value for the pair-wise absolute correlation cutoff. e.g. 
0.95\nthresh_corr_cut = 0.95\n\nexcludes = []\nfile_name = \"Step_06_Preprocess_Corr_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=True)\n\nfile_name = \"Step_06_Preprocess_Corr\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_06_Preprocess_Corr_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 6.4. Descriptive Statsistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n#%%\n\n# columns\nfile_name = \"Step_06_4_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_06_4_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_4_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_06_4_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_4_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n# ### 6.5. 
Transformations\n\n# Verify features visually\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# Tranformation: scale\n# Note:: It is highly resource intensive\n\n#%%\n\ntransform_type = \"scale\"\nkwargs = {\"with_mean\": True}\nmethod_args = dict()\nexcludes = list(features_types_group[\"CATEGORICAL\"]) + \\\n list(features_types_group[\"DUMMIES\"])\n\nfeatures[\"train_indep\"], method_args = preprocess.transform_df(df=features[\"train_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\nfeatures[\"test_indep\"], _ = preprocess.transform_df(df=features[\"test_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\n\n# print(\"Metod arguments:\", method_args)\n\n\n# Tranformation: Yeo-Johnson\n# Note:: It is highly resource intensive\n\n#%%\n\ntransform_type = \"yeo_johnson\"\nkwargs = {\"lmbda\": -0.5, \"derivative\": 0,\n \"epsilon\": np.finfo(np.float).eps, \"inverse\": False}\nmethod_args = dict()\nexcludes = list(features_types_group[\"CATEGORICAL\"]) + \\\n list(features_types_group[\"DUMMIES\"])\n\nfeatures[\"train_indep\"], method_args = preprocess.transform_df(df=features[\"train_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\nfeatures[\"test_indep\"], _ = preprocess.transform_df(df=features[\"test_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\n\n# print(\"Metod arguments:\", method_args)\n\n\n# Visual verification\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# ### 6.6. Summary Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n#%%\n\n# Statsistics report for 'Categorical', 'Continuous', & 'TARGET' variables\n# columns\nfile_name = \"Step_06_6_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_06_6_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_6_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_06_6_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_6_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n#
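\n\n# Both transformations above go through the project's PreProcess.transform_df wrapper. For reference only, a comparable scale-then-Yeo-Johnson step can be written directly with scikit-learn; note that, unlike the fixed lmbda=-0.5 passed above, sklearn's PowerTransformer estimates the Yeo-Johnson lambda per feature, so this is an analogue rather than an exact reproduction.\n\n#%%\n\n# illustrative analogue -- not the PreProcess.transform_df implementation\nimport pandas as pd\nfrom sklearn.preprocessing import PowerTransformer, StandardScaler\n\n\ndef scale_then_yeo_johnson(train_df, test_df):\n    scaler = StandardScaler(with_mean=True)\n    power = PowerTransformer(method='yeo-johnson', standardize=False)\n    # fit on the train split only, then re-use the fitted transformers on the test split\n    train_t = power.fit_transform(scaler.fit_transform(train_df))\n    test_t = power.transform(scaler.transform(test_df))\n    columns = train_df.columns\n    return (pd.DataFrame(train_t, columns=columns, index=train_df.index),\n            pd.DataFrame(test_t, columns=columns, index=test_df.index))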

\n\n# ## 7. Rank & Select Features\n\n# Configure: the general settings\n\n#%%\n\n# select the target variable\ntarget_feature = \"label365\" # \"label30\", \"label365\"\n\n# number of trials\nnum_trials = 1\n\nmodel_rank = dict()\no_summaries_df = dict()\n\n\n# ### 7.1. Define\n\n# Ranking Method: Random forest classifier (Brieman)\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n#%%\n\ndef rank_random_forest_brieman(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": False, \"class_weight\": None})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 50, \"min_samples_leaf\": 25,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": False, \"class_weight\": None})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 40, \"min_samples_leaf\": 20,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": True, \"class_weight\": None})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(\n model_rank[i].feature_importances_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# Ranking Method: Gradient Boosted Regression Trees (GBRT)\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n#%%\n\ndef rank_gbrt(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 10, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 5, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 3, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(\n model_rank[i].feature_importances_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# Ranking Method: Randomized Logistic Regression\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n#%%\n\ndef rank_randLogit(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.75, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.50, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.90, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(model_rank[i].scores_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# ### 7.2. Run\n\n# Run one or more feature ranking methods and trials\n\n# Ranking Method: Random forest classifier (Brieman)\n# Note:: It is moderately resource intensive\n\n#%%\n\nrank_model = \"rfc\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_random_forest_brieman(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# Ranking Method: Gradient Boosted Regression Trees (GBRT)\n# Note:: It is moderately resource intensive\n\n#%%\n\nrank_model = \"gbrt\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_gbrt(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# Ranking Method: Randomized Logistic Regression\n# Note:: It is moderately resource intensive\n\n#%%\n\nrank_model = \"randLogit\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_randLogit(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# ### 7.3. 
Summaries\n\n#%%\n\n# combine scores\ndef rank_summarise(features_arg, o_summaries_df_arg):\n summaries_temp = {'Order_avg': [], 'Order_max': [],\n 'Order_min': [], 'Importance_avg': []}\n summary_order = []\n summary_importance = []\n\n for f_name in list(features_arg.columns.values):\n for i in range(len(o_summaries_df_arg)):\n summary_order.append(\n o_summaries_df_arg[i][o_summaries_df_arg[i]['Name'] == f_name]['Order'].values)\n summary_importance.append(\n o_summaries_df_arg[i][o_summaries_df_arg[i]['Name'] == f_name]['Importance'].values)\n\n summaries_temp['Order_avg'].append(\n statistics.mean(np.concatenate(summary_order)))\n summaries_temp['Order_max'].append(max(np.concatenate(summary_order)))\n summaries_temp['Order_min'].append(min(np.concatenate(summary_order)))\n summaries_temp['Importance_avg'].append(\n statistics.mean(np.concatenate(summary_importance)))\n\n summaries_df = pd.DataFrame({'Name': list(features_arg.columns.values)})\n summaries_df['Order_avg'] = summaries_temp['Order_avg']\n summaries_df['Order_max'] = summaries_temp['Order_max']\n summaries_df['Order_min'] = summaries_temp['Order_min']\n summaries_df['Importance_avg'] = summaries_temp['Importance_avg']\n summaries_df = summaries_df.sort_values(['Order_avg'], ascending=[1])\n return summaries_df\n\n#%%\n\n# combine scores\nsummaries_df = dict()\n\nfor rank_model in o_summaries_df.keys():\n summaries_df[rank_model] = dict()\n summaries_df[rank_model] = rank_summarise(\n features[\"train_indep\"], o_summaries_df[rank_model])\n\n\n# Save\n\n#%%\n\nfor rank_model in model_rank.keys():\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n readers_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=model_rank[rank_model])\n\n file_name = \"Step_07_Model_Train_model_rank_summaries_\" + rank_model\n readers_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=o_summaries_df[rank_model])\n\n\n# ### 7.4. 
Select Top Features\n\n# Configure: the selection method\n\n#%%\n\nrank_model = \"rfc\"\nfile_name = \"Step_07_Top_Features_\" + rank_model\nrank_top_features_max = 400\nrank_top_features_score_min = 0.1 * (10 ^ -20)\n\n# sort features\nfeatures_names_selected = summaries_df[rank_model]['Name'][summaries_df[rank_model]\n ['Order_avg'] >= rank_top_features_score_min]\nfeatures_names_selected = (\n features_names_selected[0:rank_top_features_max]).tolist()\n\n\n# Save\n\n#%%\n\n# save to CSV\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=features_names_selected, append=False, header=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\nprint(\"List of sorted features, which can be modified:\\n \" +\n CONSTANTS.io_path + file_name + \"csv\")\n\n\n# Configure: the selected feature manually if it isnecessary!\n\n#%%\n\nfile_name = \"Step_07_Top_Features_rfc_adhoc\"\n\nfeatures_names_selected = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=file_name, dataframing=False)[0]\nfeatures_names_selected = [f.replace(\"\\n\", \"\")\n for f in features_names_selected]\ndisplay(pd.DataFrame(features_names_selected))\n\n\n# Verify the top features visually\n\n#%%\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns),\n \";\\nNumber of top columns: \", len(features[\"train_indep\"][features_names_selected].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"][features_names_selected]), \", test: \", len(\n features[\"test_indep\"][features_names_selected]), \"}\")\n\n\n# ### 7.5. Summary Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n#%%\n\n# columns\nfile_name = \"Step_07_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"][features_names_selected].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_07_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"][features_names_selected], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_07_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"][features_names_selected], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_07_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"][features_names_selected], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_07_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"][features_names_selected], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n# ### 7.6. Save Features\n\n#%%\n\nfile_name = \"Step_07_Features\"\nreaders_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=features)\n\n# print\nprint(\"File size: \", os.stat(os.path.join(\n CONSTANTS.io_path, file_name + \".bz2\")).st_size)\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n#
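\n\n# The ranking trials above rely on the project's FeatureSelection helpers. As a compact reference for the same idea -- rank features by importance over several runs and average the resulting positions -- here is a self-contained scikit-learn sketch on toy data (two seeds; not the notebook's actual FeatureSelection code).\n\n#%%\n\nimport pandas as pd\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\n\nX_toy, y_toy = make_classification(n_samples=500, n_features=8, random_state=0)\nX_toy = pd.DataFrame(X_toy, columns=['f' + str(i) for i in range(8)])\n\norders = []\nfor seed in (0, 1):\n    forest = RandomForestClassifier(n_estimators=50, random_state=seed).fit(X_toy, y_toy)\n    # rank 1 = most important feature in this trial\n    orders.append(pd.Series(forest.feature_importances_, index=X_toy.columns).rank(ascending=False))\n\norder_avg = pd.concat(orders, axis=1).mean(axis=1).sort_values()\nprint(order_avg)  # smallest average rank = most consistently important feature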

\n\n#

\n\n# ## 8. Model\n\n# Load a Saved Samples and Features Ranking:\n#
It is an optional step. The step loads the serialised & compressed outputs of Step-7.\n\n#%%\n\n# open fetures\nfile_name = \"Step_07_Features\"\nfeatures = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n# print\nprint(\"File size: \", os.stat(os.path.join(\n CONSTANTS.io_path, file_name + \".bz2\")).st_size)\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n#%%\n\n# open scoring model files\nrank_models = [\"rfc\", \"gbrt\", \"randLogit\"]\nmodel_rank = dict()\no_summaries_df = dict()\n\nfor rank_model in rank_models:\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n if not readers_writers.exists_serialised(path=CONSTANTS.io_path, title=file_name, ext=\"bz2\"):\n continue\n\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n model_rank[rank_model] = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n file_name = \"Step_07_Model_Train_model_rank_summaries_\" + rank_model\n o_summaries_df[rank_model] = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n\n# Verify features visually\n\n#%%\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n#
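\n\n# The .bz2 snapshots above are written and read through the project's ReadersWriters class, whose implementation is not included in this notebook. A generic stand-in built from the standard library (pickle + bz2) could look like the sketch below; the real helper may store things differently.\n\n#%%\n\nimport bz2\nimport pickle\n\n\ndef save_compressed(path, obj):\n    # serialise an arbitrary Python object into a bz2-compressed pickle\n    with bz2.open(path, 'wb') as handle:\n        pickle.dump(obj, handle)\n\n\ndef load_compressed(path):\n    with bz2.open(path, 'rb') as handle:\n        return pickle.load(handle)\n\n\n# save_compressed('Step_07_Features.bz2', features)\n# features = load_compressed('Step_07_Features.bz2')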

\n\n# ### 8.1. Initialise\n\n# #### 8.1.1. Algorithms\n\n# Configure: the trianing algorithm\n\n# Algorithm 1: Random Forest\n\n#%%\n\nmethod_name = \"rfc\"\nkwargs = {\"n_estimators\": 20, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 100,\n \"min_samples_leaf\": 50, \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto',\n \"max_leaf_nodes\": None, \"bootstrap\": True, \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None,\n \"verbose\": 0, \"warm_start\": False, \"class_weight\": \"balanced_subsample\"}\n\n\n# Algorithm 2: Logistic Regression\n\n#%%\n\nmethod_name = \"lr\"\nkwargs = {\"penalty\": 'l1', \"dual\": False, \"tol\": 0.0001, \"C\": 1, \"fit_intercept\": True, \"intercept_scaling\": 1,\n \"class_weight\": None, \"random_state\": None, \"solver\": 'liblinear', \"max_iter\": 100, \"multi_class\": 'ovr',\n \"verbose\": 0, \"warm_start\": False, \"n_jobs\": -1}\n\n\n# Algorithm 3: Logistic Cross-Validation\n\n#%%\n\nmethod_name = \"lr_cv\"\nkwargs = {\"Cs\": 10, \"fit_intercept\": True, \"cv\": None, \"dual\": False, \"penalty\": 'l2', \"scoring\": None,\n \"solver\": 'lbfgs', \"tol\": 0.0001, \"max_iter\": 10, \"class_weight\": None, \"n_jobs\": -1, \"verbose\": 0,\n \"refit\": True, \"intercept_scaling\": 1.0, \"multi_class\": \"ovr\", \"random_state\": None}\n\n\n# Algorithm 4: Neural Network\n\n#%%\n\nmethod_name = \"nn\"\nkwargs = {\"solver\": 'lbfgs', \"alpha\": 1e-5,\n \"hidden_layer_sizes\": (5, 2), \"random_state\": 1}\n\n\n# Algorithm 5: k-Nearest Neighbourhood\n\n#%%\n\nmethod_name = \"knc\"\nkwargs = {\"n_neighbors\": 5, \"weights\": 'distance', \"algorithm\": 'auto', \"leaf_size\": 30,\n \"p\": 2, \"metric\": 'minkowski', \"metric_params\": None, \"n_jobs\": -1}\n\n\n# Algorithm 6: Decision Tree\n\n#%%\n\nmethod_name = \"dtc\"\nkwargs = {\"criterion\": 'gini', \"splitter\": 'best', \"max_depth\": None, \"min_samples_split\": 30,\n \"min_samples_leaf\": 30, \"min_weight_fraction_leaf\": 0.0, \"max_features\": None,\n \"random_state\": None, \"max_leaf_nodes\": None, \"class_weight\": None, \"presort\": False}\n\n\n# Algorithm 7: Gradient Boosting Classifier\n\n#%%\n\nmethod_name = \"gbc\"\nkwargs = {\"loss\": 'deviance', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 30,\n \"min_samples_leaf\": 30, \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 3, \"init\": None, \"random_state\": None,\n \"max_features\": None, \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": 'auto'}\n\n\n# Algorithm 8: Naive Bayes
\n# Note: features must be positive\n\n#%%\n\nmethod_name = \"nb\"\ntraining_method = TrainingMethod(method_name)\nkwargs = {\"alpha\": 1.0, \"fit_prior\": True, \"class_prior\": None}\n\n\n#
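\n\n# The kwargs dictionaries in this section mirror scikit-learn estimator parameters; the 'nb' settings above (alpha, fit_prior, class_prior), for example, match sklearn's MultinomialNB, which is also why the features must be non-negative. Assuming TrainingMethod ultimately wraps such estimators (an assumption -- its source is not shown here), an equivalent direct fit on toy data would be:\n\n#%%\n\n# illustrative only -- toy data, not the TCARER samples\nimport numpy as np\nfrom sklearn.naive_bayes import MultinomialNB\n\nrng = np.random.RandomState(0)\nX_toy = np.abs(rng.randn(20, 4))    # non-negative features, as MultinomialNB requires\ny_toy = rng.randint(0, 2, size=20)  # binary target\n\nnb_model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)\nnb_model.fit(X_toy, y_toy)\nprint(nb_model.predict(X_toy[:3]))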

\n\n# #### 8.1.2. Other Settings\n\n# Configure: other modelling settings\n\n#%%\n\n# select the target variable\ntarget_feature = \"label365\" # \"label30\" , \"label365\"\n\n# file name\nfile_name = \"Step_09_Model_\" + method_name + \"_\" + target_feature\n\n# initialise\ntraining_method = TrainingMethod(method_name)\n\n\n# #### 8.1.3. Features\n\n#%%\n\n# features[\"train_indep\"][features_names_selected], features[\"train_indep\"]\nsample_train = features[\"train_indep\"][features_names_selected]\n# features[\"train_target\"][target_feature]\nsample_train_target = features[\"train_target\"][target_feature]\n# features[\"test_indep\"][features_names_selected], features[\"test_indep\"]\nsample_test = features[\"test_indep\"][features_names_selected]\n# features[\"test_target\"][target_feature]\nsample_test_target = features[\"test_target\"][target_feature]\n\n\n# ### 8.3. Fit", "target_code": "model = training_method.train(sample_train, sample_train_target, **kwargs)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER)\n# ## Basic Models\n\n# [1. Initialise](#1.-Initialise)\n# \n# [2. Generate Features](#2.-Generate-Features)\n# \n# [3. Read Data](#3.-Read-Data)\n# \n# [4. Filter Features](#4.-Filter-Features)\n# \n# [5. Set Samples & Target Features](#5.-Set-Samples-&-Target-Features)\n# \n# [6. Recategorise & Transform](#6.-Recategorise-&-Transform)\n# \n# [7. Rank & Select Features](#7.-Rank-&-Select-Features)\n# \n# [8. Model](#8.-Model)\n# \n\n# This Jupyter IPython Notebook applies the Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER).\n#\n# This Jupyter IPython Notebook extract aggregated features from the MySQL database, & then pre-process, configure & apply several modelling approaches.\n#\n# The pre-processing framework & modelling algorithms in this Notebook are developed as part of the Integrated Care project at the Health & Social Care Modelling Group (HSCMG), The University of Westminster.\n#\n# Note that some of the scripts are optional or subject to some pre-configurations. Please refer to the comments & the project documentations for further details.\n\n# \n# Copyright 2017 The Project Authors. All Rights Reserved.\n#\n# It is licensed under the Apache License, Version 2.0. you may not use this file except in compliance with the License. You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n# \n\n# ## 1. Initialise\n\n# Reload modules\n\n\n# Reload modules\n# It is an optional step. 
It is useful to run when external Python modules are being modified\n# It is reloading all modules (except those excluded by %aimport) every time before executing the Python code typed.\n# Note: It may conflict with serialisation, when external modules are being modified\n\n# %load_ext autoreload\n# %autoreload 2\n\n\n# Import libraries\n\n\n# Import Python libraries\nimport logging\nimport os\nimport sys\nimport gc\nimport pandas as pd\nfrom IPython.display import display, HTML\nfrom collections import OrderedDict\nimport numpy as np\nimport statistics\nfrom scipy.stats import stats\n\n\n# Import local Python modules\nfrom Configs.CONSTANTS import CONSTANTS\nfrom Configs.Logger import Logger\nfrom Features.Variables import Variables\nfrom ReadersWriters.ReadersWriters import ReadersWriters\nfrom Stats.PreProcess import PreProcess\nfrom Stats.FeatureSelection import FeatureSelection\nfrom Stats.TrainingMethod import TrainingMethod\nfrom Stats.Plots import Plots\n\n\n# Check the interpreter\nprint(\"\\nMake sure the correct Python interpreter is used!\")\nprint(sys.version)\nprint(\"\\nMake sure sys.path of the Python interpreter is correct!\")\nprint(os.getcwd())\n\n\n#

\n\n# ### 1.1. Initialise General Settings\n\n# Main configuration Settings: \n# - Specify the full path of the configuration file\n#
→ config_path\n# - Specify the full path of the output folder\n#
→ io_path\n# - Specify the application name (the suffix of the outputs file name)\n#
→ app_name\n# - Specify the sub-model name, to locate the related feature configuration, based on the \"Table_Reference_Name\" column in the configuration file\n#
→ submodel_name\n# - Specify the file name of the sub-model's input (excluding the CSV extension)\n#
→ submodel_input_name\n#
\n#
\n#\n# External Configuration Files: \n# - The MySQL database configuration settings & other configuration metadata\n#
Inputs/CONFIGURATIONS_1.ini\n# - The input features' configuration file (Note: only the CSV export of the XLSX will be used by this Notebook)\n#
Inputs/config_features_path.xlsx\n#
Inputs/config_features_path.csv\n\n\nconfig_path = os.path.abspath(\"ConfigInputs/CONFIGURATIONS.ini\")\nio_path = os.path.abspath(\"../../tmp/TCARER/Basic_prototype\")\napp_name = \"T-CARER\"\nsubmodel_name = \"hesIp\"\nsubmodel_input_name = \"tcarer_model_features_ip\"\n\nprint(\"\\n The full path of the configuration file: \\n\\t\", config_path,\n \"\\n The full path of the output folder: \\n\\t\", io_path,\n \"\\n The application name (the suffix of the outputs file name): \\n\\t\", app_name,\n \"\\n The sub-model name, to locate the related feature configuration: \\n\\t\", submodel_name,\n \"\\n The the sub-model's the file name of the input: \\n\\t\", submodel_input_name)\n\n\n#

\n\n# Initialise logs\n\n\nif not os.path.exists(io_path):\n os.makedirs(io_path, exist_ok=True)\n\nlogger = Logger(path=io_path, app_name=app_name, ext=\"log\")\nlogger = logging.getLogger(app_name)\n\n\n# Initialise constants and some of classes\n\n\n# Initialise constants\nCONSTANTS.set(io_path, app_name)\n\n\n# Initialise other classes\nreaders_writers = ReadersWriters()\npreprocess = PreProcess(io_path)\nfeature_selection = FeatureSelection()\nplts = Plots()\n\n\n# Set print settings\npd.set_option('display.width', 1600, 'display.max_colwidth', 800)\n\n\n# ### 1.2. Initialise Features Metadata\n\n# Read the input features' confugration file & store the features metadata\n\n\n# variables settings\nfeatures_metadata = dict()\n\nfeatures_metadata_all = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=CONSTANTS.config_features_path, dataframing=True)\nfeatures_metadata = features_metadata_all.loc[(features_metadata_all[\"Selected\"] == 1) &\n (features_metadata_all[\"Table_Reference_Name\"] == submodel_name)]\nfeatures_metadata.reset_index()\n\n# print\ndisplay(features_metadata)\n\n\n# Set input features' metadata dictionaries\n\n\n# Dictionary of features types, dtypes, & max-states\nfeatures_types = dict()\nfeatures_dtypes = dict()\nfeatures_states_values = dict()\nfeatures_names_group = dict()\n\nfor _, row in features_metadata.iterrows():\n if not pd.isnull(row[\"Variable_Max_States\"]):\n states_values = str(row[\"Variable_Max_States\"]).split(',')\n states_values = list(map(int, states_values))\n else:\n states_values = None\n\n if not pd.isnull(row[\"Variable_Aggregation\"]):\n postfixes = row[\"Variable_Aggregation\"].replace(' ', '').split(',')\n f_types = row[\"Variable_Type\"].replace(' ', '').split(',')\n f_dtypes = row[\"Variable_dType\"].replace(' ', '').split(',')\n for p in range(len(postfixes)):\n features_types[row[\"Variable_Name\"] +\n \"_\" + postfixes[p]] = f_types[p]\n features_dtypes[row[\"Variable_Name\"] + \"_\" +\n postfixes[p]] = pd.Series(dtype=f_dtypes[p])\n features_states_values[row[\"Variable_Name\"] +\n \"_\" + postfixes[p]] = states_values\n features_names_group[row[\"Variable_Name\"] + \"_\" +\n postfixes[p]] = row[\"Variable_Name\"] + \"_\" + postfixes[p]\n else:\n features_types[row[\"Variable_Name\"]] = row[\"Variable_Type\"]\n features_dtypes[row[\"Variable_Name\"]] = row[\"Variable_dType\"]\n features_states_values[row[\"Variable_Name\"]] = states_values\n features_names_group[row[\"Variable_Name\"]] = row[\"Variable_Name\"]\n if states_values is not None:\n for postfix in states_values:\n features_names_group[row[\"Variable_Name\"] +\n \"_\" + str(postfix)] = row[\"Variable_Name\"]\n\nfeatures_dtypes = pd.DataFrame(features_dtypes).dtypes\n\n\n# Dictionary of features groups\nfeatures_types_group = OrderedDict()\n\nf_types = set([f_type for f_type in features_types.values()])\nfeatures_types_group = OrderedDict(\n zip(list(f_types), [set() for _ in range(len(f_types))]))\nfor f_name, f_type in features_types.items():\n features_types_group[f_type].add(f_name)\n\nprint(\"Available features types: \" + ','.join(f_types))\n\n\n#

\n\n# ## 2. Generate Features\n\n# Notes:\n# - It generates the final spell-wise & temporal features from the MySQL table(s), & converts it into CSV(s);\n# - It generates the CSV(s) based on the configuration file of the features (Note: only the CSV export of the XLSX will be used by this Notebook)\n#
Inputs/config_features_path.xlsx\n#
Inputs/config_features_path.csv\n\n\nskip = True\n\n# settings\ncsv_schema = [\"my_db_schema\"]\ncsv_input_tables = [\"tcarer_features\"]\ncsv_history_tables = [\"hesIp\"]\ncsv_column_index = \"localID\"\ncsv_output_table = \"tcarer_model_features_ip\"\ncsv_query_batch_size = 100000\n\n\nif skip is False:\n # generate the csv file\n variables = Variables(submodel_name,\n CONSTANTS.io_path,\n CONSTANTS.io_path,\n CONSTANTS.config_features_path,\n csv_output_table)\n variables.set(csv_schema, csv_input_tables, csv_history_tables,\n csv_column_index, csv_query_batch_size)\n\n\n#

\n\n# ## 3. Read Data\n\n# Read the input features from the CSV input file\n\n\nfeatures_input = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=submodel_input_name, dataframing=True)\nfeatures_input.astype(dtype=features_dtypes)\n\nprint(\"Number of columns: \", len(features_input.columns),\n \"; Total records: \", len(features_input.index))\n\n\n# Verify features visually\n\n\ndisplay(features_input.head())\n\n\n#

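# Optional sanity check (illustrative, not in the original notebook): confirm that every
# column named in the metadata-driven dtype map was actually present in the CSV before
# the filtering steps below; features_dtypes is the dtype Series built in section 1.2.
missing_columns = [c for c in features_dtypes.index if c not in features_input.columns]
print("Configured columns missing from the input CSV:", missing_columns)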
\n\n# ## 4. Filter Features\n\n# ### 4.1. Descriptive Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n\nfile_name = \"Step_04_Data_ColumnNames\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=list(\n features_input.columns.values), append=False)\nfile_name = \"Step_04_Stats_Categorical\"\no_stats = preprocess.stats_discrete_df(df=features_input, includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_04_Stats_Continuous\"\no_stats = preprocess.stats_continuous_df(df=features_input, includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\nfile_name = \"Step_04_Stats_Target\"\no_stats = preprocess.stats_discrete_df(df=features_input, includes=features_types_group[\"TARGET\"],\n file_name=file_name)\n\n\n# ### 4.2. Selected Population\n\n# #### 4.2.1. Remove Excluded Population, Remove Unused Features\n\n# Nothing to do!\n#
\n# Notes: \n# - Ideally, the features should be configured before generating the CSV feature file, as it is very inefficient to derive new features at this stage\n# - This step is not necessary if all the features are generated prior to the generation of the CSV feature file\n\n\n# Exclusion of unused features\n# excluded = [name for name in features_input.columns if name not in features_names_group.keys()]\n# features_input = features_input.drop(excluded, axis=1)\n\n# print(\"Number of columns: \", len(features_input.columns), \"; Total records: \", len(features_input.index))\n\n\n#

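# Hedged illustration of what the descriptive-statistics helpers in section 4.1 roughly
# produce, using plain pandas on the grouped feature types. stats_discrete_df and
# stats_continuous_df are project-specific, so their exact output format is not
# reproduced here; this only shows the underlying idea.
example_categorical = [c for c in features_types_group["CATEGORICAL"] if c in features_input.columns]
example_continuous = [c for c in features_types_group["CONTINUOUS"] if c in features_input.columns]

# frequency tables for discrete features, summary statistics for continuous ones
example_discrete_stats = {c: features_input[c].value_counts(dropna=False) for c in example_categorical}
example_continuous_stats = features_input[example_continuous].describe().T
display(example_continuous_stats.head())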
\n\n# ## 5. Set Samples & Target Features\n\n# ### 5.1. Set Features\n\n# #### 5.1.1. Train & Test Samples\n\n# Set the samples\n\n\nfrac_train = 0.50\nreplace = False\nrandom_state = 100\n\nnrows = len(features_input.index)\nfeatures = {\"train\": dict(), \"test\": dict()}\nfeatures[\"train\"] = features_input.sample(\n frac=frac_train, replace=False, random_state=100)\nfeatures[\"test\"] = features_input.drop(features[\"train\"].index)\n\nfeatures[\"train\"] = features[\"train\"].reset_index(drop=True)\nfeatures[\"test\"] = features[\"test\"].reset_index(drop=True)\n\n\n# Verify features visually\n\n\ndisplay(features_input.head())\n\n\n# Clean-Up\n\n\nfeatures_input = None\ngc.collect()\n\n\n# #### 5.1.2. Independent & Target variable\u00b6\n\n# Set independent, target & ID features\n\n\ntarget_labels = list(features_types_group[\"TARGET\"])\ntarget_id = [\"patientID\"]\n\n\nfeatures[\"train_indep\"] = dict()\nfeatures[\"train_target\"] = dict()\nfeatures[\"train_id\"] = dict()\nfeatures[\"test_indep\"] = dict()\nfeatures[\"test_target\"] = dict()\nfeatures[\"test_id\"] = dict()\n\n# Independent and target features\n\n\ndef set_features_indep_target(df):\n df_targets = pd.DataFrame(\n dict(zip(target_labels, [[]] * len(target_labels))))\n for i in range(len(target_labels)):\n df_targets[target_labels[i]] = df[target_labels[i]]\n\n df_indep = df.drop(target_labels + target_id, axis=1)\n df_id = pd.DataFrame({target_id[0]: df[target_id[0]]})\n\n return df_indep, df_targets, df_id\n\n\n# train & test sets\nfeatures[\"train_indep\"], features[\"train_target\"], features[\"train_id\"] = set_features_indep_target(\n features[\"train\"])\nfeatures[\"test_indep\"], features[\"test_target\"], features[\"test_id\"] = set_features_indep_target(\n features[\"test\"])\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# Verify features visually\n\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# Clean-Up\n\n\ndel features[\"train\"]\ndel features[\"test\"]\ngc.collect()\n\n\n# ### 5.5. Save Samples\n\n# Serialise & save the samples before any feature transformation.\n#
This snapshot of the samples may be used for population profiling\n\n\nfile_name = \"Step_05_Features\"\nreaders_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=features)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns),\n \"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(features[\"test_indep\"]), \"}\")\n\n\n# ### 5.2. Remove - Near Zero Variance\n# In order to reduce sparseness and invalid features, highly stationary ones were withdrawn. The features that had constant counts less than or equal to a threshold were filtered out, to exclude highly constant and near-zero-variance features.\n#\n# The near-zero-variance rules are as follows:\n# - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be greater than a threshold;\n# - Percent of unique values: The number of unique values divided by the total number of samples to be greater than the threshold\n# (A stand-alone sketch of this filter and the correlation filter in 5.3 follows after section 5.4.)\n#\n# Configure: the function\n# - The cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100\n#
→ thresh_unique_cut\n# - The cutoff for the ratio of the most common value to the second most common value (lower limit). e.g. 95/5\n#
→ thresh_freq_cut\n\n\nthresh_unique_cut = 100\nthresh_freq_cut = 1000\n\nexcludes = []\nfile_name = \"Step_05_Preprocess_NZV_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=True)\n\nfile_name = \"Step_05_Preprocess_NZV\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_05_Preprocess_NZV_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 5.3. Remove Highly Linearly Correlated\n#\n# In this step, features that were highly linearly correlated were excluded.\n#\n# Configure: the function\n# - A numeric value for the pair-wise absolute correlation cutoff. e.g. 0.95\n#
→ thresh_corr_cut\n\n\nthresh_corr_cut = 0.95\n\nexcludes = list(features_types_group[\"CATEGORICAL\"])\nfile_name = \"Step_05_Preprocess_Corr_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=True)\n\nfile_name = \"Step_05_Preprocess_Corr\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_05_Preprocess_Corr_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 5.4. Descriptive Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n\n# columns\nfile_name = \"Step_05_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_05_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_05_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_05_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_05_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n#

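# Stand-alone sketch of the two filters applied in sections 5.2-5.3 above. The project
# implements them in preprocess.near_zero_var_df and preprocess.high_linear_correlation_df;
# the caret-style rules below are an approximation of that behaviour, not the exact
# implementation.
import numpy as np


def near_zero_variance_columns(df, thresh_freq_cut=1000, thresh_unique_cut=100):
    """Flag columns whose most/second-most frequent value ratio exceeds thresh_freq_cut
    while the percentage of distinct values stays below thresh_unique_cut."""
    flagged = []
    for col in df.columns:
        counts = df[col].value_counts()
        freq_ratio = counts.iloc[0] / counts.iloc[1] if len(counts) > 1 else np.inf
        unique_pct = 100.0 * df[col].nunique() / len(df)
        if freq_ratio > thresh_freq_cut and unique_pct < thresh_unique_cut:
            flagged.append(col)
    return flagged


def highly_correlated_columns(df, thresh_corr_cut=0.95):
    """Flag columns whose absolute pair-wise correlation with an earlier column
    exceeds thresh_corr_cut."""
    corr = df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    return [c for c in upper.columns if (upper[c] > thresh_corr_cut).any()]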
\n\n# ## 6. Recategorise & Transform\n\n# Verify features visually\n\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# ### 6.1. Recategorise\n\n# Define the factorisation function to generate dummy features for the categorical features.\n\n\ndef factorise_settings(max_categories_frac, min_categories_num, exclude_zero):\n categories_dic = dict()\n labels_dic = dict()\n dtypes_dic = dict()\n dummies = []\n\n for f_name in features_types_group[\"CATEGORICAL\"]:\n if f_name in features[\"train_indep\"]:\n # find top & valid states\n summaries = stats.itemfreq(features[\"train_indep\"][f_name])\n summaries = pd.DataFrame(\n {\"value\": summaries[:, 0], \"freq\": summaries[:, 1]})\n summaries[\"value\"] = list(map(int, summaries[\"value\"]))\n summaries = summaries.sort_values(\"freq\", ascending=False)\n summaries = list(summaries[\"value\"])\n\n # exclude zero state\n if exclude_zero is True and len(summaries) > 1:\n summaries = [s for s in summaries if s != 0]\n\n # if included in the states\n summaries = [v for v in summaries if v in set(\n features_states_values[f_name])]\n\n # limit number of states\n max_cnt = max(\n int(len(summaries) * max_categories_frac), min_categories_num)\n\n # set states\n categories_dic[f_name] = summaries[0:max_cnt]\n labels_dic[f_name] = [f_name + \"_\" +\n str(c) for c in categories_dic[f_name]]\n dtypes_dic = {**dtypes_dic,\n **dict(zip(labels_dic[f_name], [pd.Series(dtype='i') for _ in range(len(categories_dic[f_name]))]))}\n dummies += labels_dic[f_name]\n\n dtypes_dic = pd.DataFrame(dtypes_dic).dtypes\n\n # print\n print(\"Total Categorical Variables : \", len(categories_dic.keys()),\n \"; Total Number of Dummy Variables: \", sum([len(categories_dic[f_name]) for f_name in categories_dic.keys()]))\n return categories_dic, labels_dic, dtypes_dic, features_types\n\n\n# Select categories: by order of freq., max_categories_frac, & max_categories_num\n#\n#
Configure: The input arguments are:\n# - Specify the maximum fraction of a feature's observed categories to keep\n#
→ max_categories_frac\n# - Specify the minimum number of categories a feature can have\n#
→ min_categories_num\n# - Specify whether to exclude the state '0' (zero). State zero in our features represents 'any other state', including NULL\n#
→ exclude_zero = False\n\n\nmax_categories_frac = 0.90\nmin_categories_num = 1\nexclude_zero = False # if possible remove state zero\n\ncategories_dic, labels_dic, dtypes_dic, features_types_group[\"DUMMIES\"] = factorise_settings(\n max_categories_frac, min_categories_num, exclude_zero)\n\n\n# Manually add dummy variables to the dataframe & remove the original Categorical variables\n\n\nfeatures[\"train_indep_temp\"] = preprocess.factoring_feature_wise(\n features[\"train_indep\"], categories_dic, labels_dic, dtypes_dic, threaded=False)\nfeatures[\"test_indep_temp\"] = preprocess.factoring_feature_wise(\n features[\"test_indep\"], categories_dic, labels_dic, dtypes_dic, threaded=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# Verify features visually\n\n\ndisplay(pd.concat([features[\"train_id\"].head(), features[\"train_target\"].head(\n), features[\"train_indep_temp\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(), features[\"test_target\"].head(\n), features[\"test_indep_temp\"].head()], axis=1))\n\n\n# Set\n\n\nfeatures[\"train_indep\"] = features[\"train_indep_temp\"].copy(True)\nfeatures[\"test_indep\"] = features[\"test_indep_temp\"].copy(True)\n\n\n# Clean-Up\n\n\ndel features[\"train_indep_temp\"]\ndel features[\"test_indep_temp\"]\ngc.collect()\n\n\n# ### 6.2. Remove - Near Zero Variance\n\n# Optional: Remove more features with near zero variance, after the factorisation step.\n# Configure: the function\n\n\n# the cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100\nthresh_unique_cut = 100\n# the cutoff for the ratio of the most common value to the second most common value (lower limit). eg. 95/5\nthresh_freq_cut = 1000\n\nexcludes = []\nfile_name = \"Step_06_Preprocess_NZV_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=True)\n\nfile_name = \"Step_06_Preprocess_NZV\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_06_Preprocess_NZV_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.near_zero_var_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_unique_cut=thresh_unique_cut,\n thresh_freq_cut=thresh_freq_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 6.3. Remove Highly Linearly Correlated\n\n# Optional: Remove more features with highly linearly correlated, after the factorisation step.\n# Configure: the function\n\n\n# A numeric value for the pair-wise absolute correlation cutoff. e.g. 
0.95\nthresh_corr_cut = 0.95\n\nexcludes = []\nfile_name = \"Step_06_Preprocess_Corr_config\"\nfeatures[\"train_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"train_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=True)\n\nfile_name = \"Step_06_Preprocess_Corr\"\nreaders_writers.save_text(path=CONSTANTS.io_path, title=file_name,\n data=o_summaries, append=False, ext=\"log\")\n\nfile_name = \"Step_06_Preprocess_Corr_config\"\nfeatures[\"test_indep\"], o_summaries = preprocess.high_linear_correlation_df(df=features[\"test_indep\"],\n excludes=excludes,\n file_name=file_name,\n thresh_corr_cut=thresh_corr_cut,\n to_search=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# ### 6.4. Descriptive Statsistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n\n# columns\nfile_name = \"Step_06_4_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_06_4_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_4_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_06_4_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_4_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n# ### 6.5. 
Transformations\n\n# Verify features visually\n\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# Tranformation: scale\n# Note:: It is highly resource intensive\n\n\ntransform_type = \"scale\"\nkwargs = {\"with_mean\": True}\nmethod_args = dict()\nexcludes = list(features_types_group[\"CATEGORICAL\"]) + \\\n list(features_types_group[\"DUMMIES\"])\n\nfeatures[\"train_indep\"], method_args = preprocess.transform_df(df=features[\"train_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\nfeatures[\"test_indep\"], _ = preprocess.transform_df(df=features[\"test_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\n\n# print(\"Metod arguments:\", method_args)\n\n\n# Tranformation: Yeo-Johnson\n# Note:: It is highly resource intensive\n\n\ntransform_type = \"yeo_johnson\"\nkwargs = {\"lmbda\": -0.5, \"derivative\": 0,\n \"epsilon\": np.finfo(np.float).eps, \"inverse\": False}\nmethod_args = dict()\nexcludes = list(features_types_group[\"CATEGORICAL\"]) + \\\n list(features_types_group[\"DUMMIES\"])\n\nfeatures[\"train_indep\"], method_args = preprocess.transform_df(df=features[\"train_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\nfeatures[\"test_indep\"], _ = preprocess.transform_df(df=features[\"test_indep\"], excludes=excludes,\n transform_type=transform_type, threaded=False,\n method_args=method_args, **kwargs)\n\n# print(\"Metod arguments:\", method_args)\n\n\n# Visual verification\n\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n# ### 6.6. Summary Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n\n# Statsistics report for 'Categorical', 'Continuous', & 'TARGET' variables\n# columns\nfile_name = \"Step_06_6_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_06_6_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_6_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_06_6_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_06_6_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n#

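# Illustrative alternative to the project's transform_df calls in section 6.5: the same
# two transformations (centring/scaling followed by Yeo-Johnson) expressed with
# scikit-learn. Note that PowerTransformer estimates the Yeo-Johnson lambda per feature,
# whereas the code above applies a fixed lmbda, so the results are not identical. The
# frames here have already been transformed, so this is shown purely as a reference recipe.
from sklearn.preprocessing import PowerTransformer, StandardScaler

sketch_continuous = [c for c in features["train_indep"].columns
                     if c in features_types_group["CONTINUOUS"]]

sketch_scaler = StandardScaler(with_mean=True)
sketch_power = PowerTransformer(method="yeo-johnson", standardize=False)

sketch_train = sketch_power.fit_transform(
    sketch_scaler.fit_transform(features["train_indep"][sketch_continuous]))
sketch_test = sketch_power.transform(
    sketch_scaler.transform(features["test_indep"][sketch_continuous]))
print(sketch_train.shape, sketch_test.shape)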
\n\n# ## 7. Rank & Select Features\n\n# Configure: the general settings\n\n\n# select the target variable\ntarget_feature = \"label365\" # \"label30\", \"label365\"\n\n# number of trials\nnum_trials = 1\n\nmodel_rank = dict()\no_summaries_df = dict()\n\n\n# ### 7.1. Define\n\n# Ranking Method: Random forest classifier (Breiman)\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n\ndef rank_random_forest_brieman(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": False, \"class_weight\": None})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 50, \"min_samples_leaf\": 25,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": False, \"class_weight\": None})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_random_forest_breiman(\n features_indep_arg.values, features_target_arg.values,\n **{\"n_estimators\": 10, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 40, \"min_samples_leaf\": 20,\n \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto', \"max_leaf_nodes\": None, \"bootstrap\": True,\n \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None, \"verbose\": 0, \"warm_start\": True, \"class_weight\": None})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(\n model_rank[i].feature_importances_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# Ranking Method: Gradient Boosted Regression Trees (GBRT)\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n\ndef rank_gbrt(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 10, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 5, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_tree_gbrt(\n features_indep_arg.values, features_target_arg.values,\n **{\"loss\": 'ls', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 2, \"min_samples_leaf\": 1,\n \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 3, \"init\": None, \"random_state\": None, \"max_features\": None, \"alpha\": 0.9,\n \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": True})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(\n model_rank[i].feature_importances_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# Ranking Method: Randomized Logistic Regression\n#
Define a set of classifiers with different settings, to be used in feature ranking trials.\n\n\ndef rank_randLogit(features_indep_arg, features_target_arg, num_trials):\n num_settings = 3\n o_summaries_df = [pd.DataFrame({'Name': list(\n features_indep_arg.columns.values)}) for _ in range(num_trials * num_settings)]\n model_rank = [None] * (num_trials * num_settings)\n\n # trials\n for i in range(num_trials):\n print(\"Trial: \" + str(i))\n # setting-1\n s_i = i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.75, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n # setting-2\n s_i = num_trials + i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.50, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n # setting-3\n s_i = (num_trials * 2) + i\n model_rank[s_i] = feature_selection.rank_random_logistic_regression(\n features_indep_arg.values, features_target_arg.values,\n **{\"C\": 1, \"scaling\": 0.5, \"sample_fraction\": 0.90, \"n_resampling\": 200, \"selection_threshold\": 0.25, \"tol\": 0.001,\n \"fit_intercept\": True, \"verbose\": False, \"normalize\": True, \"random_state\": None, \"n_jobs\": 1, \"pre_dispatch\": '3*n_jobs'})\n\n for i in range((num_trials * num_settings)):\n o_summaries_df[i]['Importance'] = list(model_rank[i].scores_)\n o_summaries_df[i] = o_summaries_df[i].sort_values(\n ['Importance'], ascending=[0])\n o_summaries_df[i] = o_summaries_df[i].reset_index(drop=True)\n o_summaries_df[i]['Order'] = range(\n 1, len(o_summaries_df[i]['Importance']) + 1)\n return model_rank, o_summaries_df\n\n\n# ### 7.2. Run\n\n# Run one or more feature ranking methods and trials\n\n# Ranking Method: Random forest classifier (Brieman)\n# Note:: It is moderately resource intensive\n\n\nrank_model = \"rfc\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_random_forest_brieman(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# Ranking Method: Gradient Boosted Regression Trees (GBRT)\n# Note:: It is moderately resource intensive\n\n\nrank_model = \"gbrt\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_gbrt(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# Ranking Method: Randomized Logistic Regression\n# Note:: It is moderately resource intensive\n\n\nrank_model = \"randLogit\"\nmodel_rank[rank_model] = dict()\no_summaries_df[rank_model] = dict()\nmodel_rank[rank_model], o_summaries_df[rank_model] = rank_randLogit(\n features[\"train_indep\"], features[\"train_target\"][target_feature], num_trials)\n\n\n# ### 7.3. 
Summaries\n\n\n# combine scores\ndef rank_summarise(features_arg, o_summaries_df_arg):\n summaries_temp = {'Order_avg': [], 'Order_max': [],\n 'Order_min': [], 'Importance_avg': []}\n summary_order = []\n summary_importance = []\n\n for f_name in list(features_arg.columns.values):\n for i in range(len(o_summaries_df_arg)):\n summary_order.append(\n o_summaries_df_arg[i][o_summaries_df_arg[i]['Name'] == f_name]['Order'].values)\n summary_importance.append(\n o_summaries_df_arg[i][o_summaries_df_arg[i]['Name'] == f_name]['Importance'].values)\n\n summaries_temp['Order_avg'].append(\n statistics.mean(np.concatenate(summary_order)))\n summaries_temp['Order_max'].append(max(np.concatenate(summary_order)))\n summaries_temp['Order_min'].append(min(np.concatenate(summary_order)))\n summaries_temp['Importance_avg'].append(\n statistics.mean(np.concatenate(summary_importance)))\n\n summaries_df = pd.DataFrame({'Name': list(features_arg.columns.values)})\n summaries_df['Order_avg'] = summaries_temp['Order_avg']\n summaries_df['Order_max'] = summaries_temp['Order_max']\n summaries_df['Order_min'] = summaries_temp['Order_min']\n summaries_df['Importance_avg'] = summaries_temp['Importance_avg']\n summaries_df = summaries_df.sort_values(['Order_avg'], ascending=[1])\n return summaries_df\n\n\n# combine scores\nsummaries_df = dict()\n\nfor rank_model in o_summaries_df.keys():\n summaries_df[rank_model] = dict()\n summaries_df[rank_model] = rank_summarise(\n features[\"train_indep\"], o_summaries_df[rank_model])\n\n\n# Save\n\n\nfor rank_model in model_rank.keys():\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n readers_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=model_rank[rank_model])\n\n file_name = \"Step_07_Model_Train_model_rank_summaries_\" + rank_model\n readers_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=o_summaries_df[rank_model])\n\n\n# ### 7.4. 
Select Top Features\n\n# Configure: the selection method\n\n\nrank_model = \"rfc\"\nfile_name = \"Step_07_Top_Features_\" + rank_model\nrank_top_features_max = 400\nrank_top_features_score_min = 0.1 * (10 ^ -20)\n\n# sort features\nfeatures_names_selected = summaries_df[rank_model]['Name'][summaries_df[rank_model]\n ['Order_avg'] >= rank_top_features_score_min]\nfeatures_names_selected = (\n features_names_selected[0:rank_top_features_max]).tolist()\n\n\n# Save\n\n\n# save to CSV\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=features_names_selected, append=False, header=False)\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\nprint(\"List of sorted features, which can be modified:\\n \" +\n CONSTANTS.io_path + file_name + \"csv\")\n\n\n# Configure: the selected feature manually if it isnecessary!\n\n\nfile_name = \"Step_07_Top_Features_rfc_adhoc\"\n\nfeatures_names_selected = readers_writers.load_csv(\n path=CONSTANTS.io_path, title=file_name, dataframing=False)[0]\nfeatures_names_selected = [f.replace(\"\\n\", \"\")\n for f in features_names_selected]\ndisplay(pd.DataFrame(features_names_selected))\n\n\n# Verify the top features visually\n\n\n# print\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns),\n \";\\nNumber of top columns: \", len(features[\"train_indep\"][features_names_selected].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"][features_names_selected]), \", test: \", len(\n features[\"test_indep\"][features_names_selected]), \"}\")\n\n\n# ### 7.5. Summary Statistics\n\n# Produce a descriptive stat report of 'Categorical', 'Continuous', & 'TARGET' features\n\n\n# columns\nfile_name = \"Step_07_Data_ColumnNames_Train\"\nreaders_writers.save_csv(path=CONSTANTS.io_path, title=file_name,\n data=list(features[\"train_indep\"][features_names_selected].columns.values), append=False)\n\n# Sample - Train\nfile_name = \"Step_07_Stats_Categorical_Train\"\no_stats = preprocess.stats_discrete_df(df=features[\"train_indep\"][features_names_selected], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_07_Stats_Continuous_Train\"\no_stats = preprocess.stats_continuous_df(df=features[\"train_indep\"][features_names_selected], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n# Sample - Test\nfile_name = \"Step_07_Stats_Categorical_Test\"\no_stats = preprocess.stats_discrete_df(df=features[\"test_indep\"][features_names_selected], includes=features_types_group[\"CATEGORICAL\"],\n file_name=file_name)\nfile_name = \"Step_07_Stats_Continuous_Test\"\no_stats = preprocess.stats_continuous_df(df=features[\"test_indep\"][features_names_selected], includes=features_types_group[\"CONTINUOUS\"],\n file_name=file_name)\n\n\n# ### 7.6. Save Features\n\n\nfile_name = \"Step_07_Features\"\nreaders_writers.save_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name, objects=features)\n\n# print\nprint(\"File size: \", os.stat(os.path.join(\n CONSTANTS.io_path, file_name + \".bz2\")).st_size)\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n#

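# Minimal, self-contained sketch of the ranking idea in section 7 (not the project's
# feature_selection wrappers): fit a RandomForestClassifier several times, turn each
# run's feature_importances_ into a rank order, then average the orders across runs in
# the same spirit as rank_summarise above.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


def average_feature_rank(X, y, n_runs=3):
    orders = []
    for seed in range(n_runs):
        clf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=seed)
        clf.fit(X, y)
        # rank 1 = most important feature in this run
        orders.append(pd.Series(clf.feature_importances_, index=X.columns).rank(
            ascending=False, method="first"))
    return pd.concat(orders, axis=1).mean(axis=1).sort_values()

# Example call, mirroring the variables used in section 7.2 (commented out to keep this
# cell side-effect free):
# display(average_feature_rank(features["train_indep"], features["train_target"][target_feature]).head(10))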
\n\n#

\n\n# ## 8. Model\n\n# Load Saved Samples and Feature Rankings:\n#
It is an optional step. The step loads the serialised & compressed outputs of Step-7.\n\n\n# open fetures\nfile_name = \"Step_07_Features\"\nfeatures = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n# print\nprint(\"File size: \", os.stat(os.path.join(\n CONSTANTS.io_path, file_name + \".bz2\")).st_size)\nprint(\"Number of columns: \", len(features[\"train_indep\"].columns))\nprint(\"features: {train: \", len(features[\"train_indep\"]), \", test: \", len(\n features[\"test_indep\"]), \"}\")\n\n\n# open scoring model files\nrank_models = [\"rfc\", \"gbrt\", \"randLogit\"]\nmodel_rank = dict()\no_summaries_df = dict()\n\nfor rank_model in rank_models:\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n if not readers_writers.exists_serialised(path=CONSTANTS.io_path, title=file_name, ext=\"bz2\"):\n continue\n\n file_name = \"Step_07_Model_Train_model_rank_\" + rank_model\n model_rank[rank_model] = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n file_name = \"Step_07_Model_Train_model_rank_summaries_\" + rank_model\n o_summaries_df[rank_model] = readers_writers.load_serialised_compressed(\n path=CONSTANTS.io_path, title=file_name)\n\n\n# Verify features visually\n\n\ndisplay(pd.concat([features[\"train_id\"].head(\n), features[\"train_target\"].head(), features[\"train_indep\"].head()], axis=1))\ndisplay(pd.concat([features[\"test_id\"].head(\n), features[\"test_target\"].head(), features[\"test_indep\"].head()], axis=1))\n\n\n#

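# Hedged sketch of what the serialise/deserialise helpers above plausibly do under the
# hood: a bz2-compressed pickle, consistent with the ".bz2" extension checked in the
# file-size print. The real ReadersWriters implementation may differ; treat this as a
# generic equivalent rather than the project's API.
import bz2
import os
import pickle


def save_compressed(path, title, obj):
    with bz2.open(os.path.join(path, title + ".bz2"), "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_compressed(path, title):
    with bz2.open(os.path.join(path, title + ".bz2"), "rb") as f:
        return pickle.load(f)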
\n\n# ### 8.1. Initialise\n\n# #### 8.1.1. Algorithms\n\n# Configure: the trianing algorithm\n\n# Algorithm 1: Random Forest\n\n\nmethod_name = \"rfc\"\nkwargs = {\"n_estimators\": 20, \"criterion\": 'gini', \"max_depth\": None, \"min_samples_split\": 100,\n \"min_samples_leaf\": 50, \"min_weight_fraction_leaf\": 0.0, \"max_features\": 'auto',\n \"max_leaf_nodes\": None, \"bootstrap\": True, \"oob_score\": False, \"n_jobs\": -1, \"random_state\": None,\n \"verbose\": 0, \"warm_start\": False, \"class_weight\": \"balanced_subsample\"}\n\n\n# Algorithm 2: Logistic Regression\n\n\nmethod_name = \"lr\"\nkwargs = {\"penalty\": 'l1', \"dual\": False, \"tol\": 0.0001, \"C\": 1, \"fit_intercept\": True, \"intercept_scaling\": 1,\n \"class_weight\": None, \"random_state\": None, \"solver\": 'liblinear', \"max_iter\": 100, \"multi_class\": 'ovr',\n \"verbose\": 0, \"warm_start\": False, \"n_jobs\": -1}\n\n\n# Algorithm 3: Logistic Cross-Validation\n\n\nmethod_name = \"lr_cv\"\nkwargs = {\"Cs\": 10, \"fit_intercept\": True, \"cv\": None, \"dual\": False, \"penalty\": 'l2', \"scoring\": None,\n \"solver\": 'lbfgs', \"tol\": 0.0001, \"max_iter\": 10, \"class_weight\": None, \"n_jobs\": -1, \"verbose\": 0,\n \"refit\": True, \"intercept_scaling\": 1.0, \"multi_class\": \"ovr\", \"random_state\": None}\n\n\n# Algorithm 4: Neural Network\n\n\nmethod_name = \"nn\"\nkwargs = {\"solver\": 'lbfgs', \"alpha\": 1e-5,\n \"hidden_layer_sizes\": (5, 2), \"random_state\": 1}\n\n\n# Algorithm 5: k-Nearest Neighbourhood\n\n\nmethod_name = \"knc\"\nkwargs = {\"n_neighbors\": 5, \"weights\": 'distance', \"algorithm\": 'auto', \"leaf_size\": 30,\n \"p\": 2, \"metric\": 'minkowski', \"metric_params\": None, \"n_jobs\": -1}\n\n\n# Algorithm 6: Decision Tree\n\n\nmethod_name = \"dtc\"\nkwargs = {\"criterion\": 'gini', \"splitter\": 'best', \"max_depth\": None, \"min_samples_split\": 30,\n \"min_samples_leaf\": 30, \"min_weight_fraction_leaf\": 0.0, \"max_features\": None,\n \"random_state\": None, \"max_leaf_nodes\": None, \"class_weight\": None, \"presort\": False}\n\n\n# Algorithm 7: Gradient Boosting Classifier\n\n\nmethod_name = \"gbc\"\nkwargs = {\"loss\": 'deviance', \"learning_rate\": 0.1, \"n_estimators\": 100, \"subsample\": 1.0, \"min_samples_split\": 30,\n \"min_samples_leaf\": 30, \"min_weight_fraction_leaf\": 0.0, \"max_depth\": 3, \"init\": None, \"random_state\": None,\n \"max_features\": None, \"verbose\": 0, \"max_leaf_nodes\": None, \"warm_start\": False, \"presort\": 'auto'}\n\n\n# Algorithm 8: Naive Bayes
\n# Note: features must be positive\n\n\nmethod_name = \"nb\"\ntraining_method = TrainingMethod(method_name)\nkwargs = {\"alpha\": 1.0, \"fit_prior\": True, \"class_prior\": None}\n\n\n#

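# Illustrative only: each kwargs dictionary in 8.1.1 mirrors the constructor arguments of
# the corresponding scikit-learn estimator, so a configuration can be sanity-checked
# directly before it is handed to TrainingMethod (initialised in 8.1.2 below). A subset
# of the Random Forest settings from Algorithm 1 is repeated here as an example.
from sklearn.ensemble import RandomForestClassifier

example_rfc_kwargs = {"n_estimators": 20, "criterion": 'gini', "max_depth": None,
                      "min_samples_split": 100, "min_samples_leaf": 50,
                      "max_features": 'auto', "bootstrap": True, "n_jobs": -1,
                      "class_weight": "balanced_subsample"}
print(RandomForestClassifier(**example_rfc_kwargs))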
\n\n# #### 8.1.2. Other Settings\n\n# Configure: other modelling settings\n\n\n# select the target variable\ntarget_feature = \"label365\" # \"label30\" , \"label365\"\n\n# file name\nfile_name = \"Step_09_Model_\" + method_name + \"_\" + target_feature\n\n# initialise\ntraining_method = TrainingMethod(method_name)\n\n\n# #### 8.1.3. Features\n\n\n# features[\"train_indep\"][features_names_selected], features[\"train_indep\"]\nsample_train = features[\"train_indep\"][features_names_selected]\n# features[\"train_target\"][target_feature]\nsample_train_target = features[\"train_target\"][target_feature]\n# features[\"test_indep\"][features_names_selected], features[\"test_indep\"]\nsample_test = features[\"test_indep\"][features_names_selected]\n# features[\"test_target\"][target_feature]\nsample_test_target = features[\"test_target\"][target_feature]\n\n\n# ### 8.3. Fit\n\n\n\no_summaries = dict()\n# Fit\n", "project_metadata": {"full_name": "mesgarpour/T-CARER", "description": "Temporal-Comorbidity Adjusted Risk of Emergency Readmission", "topics": [], "git_url": "git://github.com/mesgarpour/T-CARER.git", "stars": 6, "watchers": 6, "forks": 4, "created": "2016-01-06T10:50:35Z", "size": 3838, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 289989, "Python": 275986}, "last_updated": "2020-08-14T17:23:12Z"}, "intent": "# Fit the model using the train sample"}, {"original_comment": "# Make predictions on the test data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # \ud83d\ude9c Predicting the Sale Price of Bulldozers using Machine Learning\n#\n# In this notebook, we're going to go through an example machine learning project with the goal of predicting the sale price of bulldozers.\n#\n# ## 1. Problem defition\n#\n# > How well can we predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have been sold for?\n#\n# ## 2. Data\n#\n# The data is downloaded from the Kaggle Bluebook for Bulldozers competition: https://www.kaggle.com/c/bluebook-for-bulldozers/data\n#\n# There are 3 main datasets:\n#\n# * Train.csv is the training set, which contains data through the end of 2011.\n# * Valid.csv is the validation set, which contains data from January 1, 2012 - April 30, 2012 You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.\n# * Test.csv is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.\n#\n# ## 3. Evaluation\n#\n# The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.\n#\n# For more on the evaluation of this project check: https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation\n#\n# **Note:** The goal for most regression evaluation metrics is to minimize the error. For example, our goal for this project will be to build a machine learning model which minimises RMSLE.\n#\n# ## 4. Features\n#\n# Kaggle provides a data dictionary detailing all of the features of the dataset. 
You can view this data dictionary on Google Sheets: https://docs.google.com/spreadsheets/d/18ly-bLR8sbDJLITkWG7ozKm8l3RyieQ2Fpgix-beSYI/edit?usp=sharing\n\n#%%\n\nfrom sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score\nfrom sklearn.ensemble import RandomForestRegressor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport sklearn\n\n#%%\n\n# Import training and validation sets\ndf = pd.read_csv(\"data/TrainAndValid.csv\",\n low_memory=False)\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.isna().sum()\n\n#%%\n\ndf.columns\n\n#%%\n\nfig, ax = plt.subplots()\nax.scatter(df[\"saledate\"][:1000], df[\"SalePrice\"][:1000])\n\n#%%\n\ndf.saledate[:1000]\n\n#%%\n\ndf.saledate.dtype\n\n#%%\n\ndf.SalePrice.plot.hist()\n\n\n# ### Parsing dates\n#\n# When we work with time series data, we want to enrich the time & date component as much as possible.\n#\n# We can do that by telling pandas which of our columns has dates in it using the `parse_dates` parameter.\n\n#%%\n\n# Import data again but this time parse dates\ndf = pd.read_csv(\"data/TrainAndValid.csv\",\n low_memory=False,\n parse_dates=[\"saledate\"])\n\n#%%\n\ndf.saledate.dtype\n\n#%%\n\ndf.saledate[:1000]\n\n#%%\n\nfig, ax = plt.subplots()\nax.scatter(df[\"saledate\"][:1000], df[\"SalePrice\"][:1000])\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.head().T\n\n#%%\n\ndf.saledate.head(20)\n\n\n# ### Sort DataFrame by saledate\n#\n# When working with time series data, it's a good idea to sort it by date.\n\n#%%\n\n# Sort DataFrame in date order\ndf.sort_values(by=[\"saledate\"], inplace=True, ascending=True)\ndf.saledate.head(20)\n\n\n# ### Make a copy of the original DataFrame\n#\n# We make a copy of the original dataframe so when we manipulate the copy, we've still got our original data.\n\n#%%\n\n# Make a copy of the original DataFrame to perform edits on\ndf_tmp = df.copy()\n\n\n# ### Add datetime parameters for `saledate` column\n\n#%%\n\ndf_tmp[\"saleYear\"] = df_tmp.saledate.dt.year\ndf_tmp[\"saleMonth\"] = df_tmp.saledate.dt.month\ndf_tmp[\"saleDay\"] = df_tmp.saledate.dt.day\ndf_tmp[\"saleDayOfWeek\"] = df_tmp.saledate.dt.dayofweek\ndf_tmp[\"saleDayOfYear\"] = df_tmp.saledate.dt.dayofyear\n\n#%%\n\ndf_tmp.head().T\n\n#%%\n\n# Now we've enriched our DataFrame with date time features, we can remove 'saledate'\ndf_tmp.drop(\"saledate\", axis=1, inplace=True)\n\n#%%\n\n# Check the values of different columns\ndf_tmp.state.value_counts()\n\n#%%\n\ndf_tmp.head()\n\n#%%\n\nlen(df_tmp)\n\n\n# ## 5. 
Modelling\n#\n# We've done enough EDA (we could always do more) but let's start to do some model-driven EDA.\n\n#%%\n\n# Let's build a machine learning model\n\nmodel = RandomForestRegressor(n_jobs=-1,\n random_state=42)\n\nmodel.fit(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])\n\n#%%\n\ndf_tmp.info()\n\n#%%\n\ndf_tmp[\"UsageBand\"].dtype\n\n#%%\n\ndf_tmp.isna().sum()\n\n\n# ### Convert string to categories\n#\n# One way we can turn all of our data into numbers is by converting them into pandas catgories.\n#\n# We can check the different datatypes compatible with pandas here: https://pandas.pydata.org/pandas-docs/stable/reference/general_utility_functions.html#data-types-related-functionality\n\n#%%\n\ndf_tmp.head().T\n\n#%%\n\npd.api.types.is_string_dtype(df_tmp[\"UsageBand\"])\n\n#%%\n\n# Find the columns which contain strings\nfor label, content in df_tmp.items():\n if pd.api.types.is_string_dtype(content):\n print(label)\n\n#%%\n\n# If you're wondering what df.items() does, here's an example\nrandom_dict = {\"key1\": \"hello\",\n \"key2\": \"world!\"}\n\nfor key, value in random_dict.items():\n print(f\"this is a key: {key}\",\n f\"this is a value: {value}\")\n\n#%%\n\n# This will turn all of the string value into category values\nfor label, content in df_tmp.items():\n if pd.api.types.is_string_dtype(content):\n df_tmp[label] = content.astype(\"category\").cat.as_ordered()\n\n#%%\n\ndf_tmp.info()\n\n#%%\n\ndf_tmp.state.cat.categories\n\n#%%\n\ndf_tmp.state.cat.codes\n\n\n# Thanks to pandas Categories we now have a way to access all of our data in the form of numbers.\n#\n# But we still have a bunch of missing data...\n\n#%%\n\n# Check missing data\ndf_tmp.isnull().sum()/len(df_tmp)\n\n\n# ### Save preprocessed data\n\n#%%\n\n# Export current tmp dataframe\ndf_tmp.to_csv(\"data/train_tmp.csv\",\n index=False)\n\n#%%\n\n# Import preprocessed data\ndf_tmp = pd.read_csv(\"data/train_tmp.csv\",\n low_memory=False)\ndf_tmp.head().T\n\n#%%\n\ndf_tmp.isna().sum()\n\n\n# ## Fill missing values\n#\n# ### Fill numerical missing values first\n\n#%%\n\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n print(label)\n\n#%%\n\ndf_tmp.ModelID\n\n#%%\n\n# Check for which numeric columns have null values\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n print(label)\n\n#%%\n\n# Fill numeric rows with the median\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n # Add a binary column which tells us if the data was missing or not\n df_tmp[label+\"_is_missing\"] = pd.isnull(content)\n # Fill missing numeric values with median\n df_tmp[label] = content.fillna(content.median())\n\n#%%\n\n# Demonstrate how median is more robust than mean\nhundreds = np.full((1000,), 100)\nhundreds_billion = np.append(hundreds, 1000000000)\nnp.mean(hundreds), np.mean(hundreds_billion), np.median(\n hundreds), np.median(hundreds_billion)\n\n#%%\n\n# Check if there's any null numeric values\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n print(label)\n\n#%%\n\n# Check to see how many examples were missing\ndf_tmp.auctioneerID_is_missing.value_counts()\n\n#%%\n\ndf_tmp.isna().sum()\n\n\n# ### Filling and turning categorical variables into numbers\n\n#%%\n\n# Check for columns which aren't numeric\nfor label, content in df_tmp.items():\n if not 
pd.api.types.is_numeric_dtype(content):\n print(label)\n\n#%%\n\n# Turn categorical variables into numbers and fill missing\nfor label, content in df_tmp.items():\n if not pd.api.types.is_numeric_dtype(content):\n # Add binary column to indicate whether sample had missing value\n df_tmp[label+\"_is_missing\"] = pd.isnull(content)\n # Turn categories into numbers and add +1\n df_tmp[label] = pd.Categorical(content).codes+1\n\n#%%\n\n# + 1 to turn -1 to 0, so we know 0 is missing value\npd.Categorical(df_tmp[\"state\"]).codes+1\n\n#%%\n\ndf_tmp.info()\n\n#%%\n\ndf_tmp.head().T\n\n#%%\n\ndf_tmp.isna().sum()\n\n\n# Now that all of data is numeric as well as our dataframe has no missing values, we should be able to build a machine learning model.\n\n#%%\n\ndf_tmp.head()\n\n#%%\n\nlen(df_tmp)\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n '# Instantiate model\\nmodel = RandomForestRegressor(n_jobs=-1,\\n random_state=42)\\n\\n# Fit the model\\nmodel.fit(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])')\n\n#%%\n\n# Score the model\nmodel.score(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])\n\n\n# **Question:** Why doesn't the above metric hold water? (why isn't the metric reliable)\n\n# ### Splitting data into train/validation sets\n\n#%%\n\ndf_tmp.saleYear\n\n#%%\n\ndf_tmp.saleYear.value_counts()\n\n#%%\n\n# Split data into training and validation\ndf_val = df_tmp[df_tmp.saleYear == 2012]\ndf_train = df_tmp[df_tmp.saleYear != 2012]\n\nlen(df_val), len(df_train)\n\n#%%\n\n# Split data into X & y\nX_train, y_train = df_train.drop(\"SalePrice\", axis=1), df_train.SalePrice\nX_valid, y_valid = df_val.drop(\"SalePrice\", axis=1), df_val.SalePrice\n\nX_train.shape, y_train.shape, X_valid.shape, y_valid.shape\n\n#%%\n\ny_train\n\n\n# ### Building an evaluation function\n\n#%%\n\n# Create evaluation function (the competition uses RMSLE)\n\n\ndef rmsle(y_test, y_preds):\n \"\"\"\n Caculates root mean squared log error between predictions and\n true labels.\n \"\"\"\n return np.sqrt(mean_squared_log_error(y_test, y_preds))\n\n# Create function to evaluate model on a few different levels\n\n\ndef show_scores(model):\n train_preds = model.predict(X_train)\n val_preds = model.predict(X_valid)\n scores = {\"Training MAE\": mean_absolute_error(y_train, train_preds),\n \"Valid MAE\": mean_absolute_error(y_valid, val_preds),\n \"Training RMSLE\": rmsle(y_train, train_preds),\n \"Valid RMSLE\": rmsle(y_valid, val_preds),\n \"Training R^2\": r2_score(y_train, train_preds),\n \"Valid R^2\": r2_score(y_valid, val_preds)}\n return scores\n\n\n# ## Testing our model on a subset (to tune the hyperparameters)\n\n#%%\n\n# # This takes far too long... 
for experimenting\n\n# %%time\n# model = RandomForestRegressor(n_jobs=-1,\n# random_state=42)\n\n# model.fit(X_train, y_train)\n\n#%%\n\nlen(X_train)\n\n#%%\n\n# Change max_samples value\nmodel = RandomForestRegressor(n_jobs=-1,\n random_state=42,\n max_samples=10000)\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n '# Cutting down on the max number of samples each estimator can see improves training time\\nmodel.fit(X_train, y_train)')\n\n#%%\n\n# original dataset size = X_train.shape[0] * 100\n# new dataset size = 10000 * 100\n# 40 times smaller\n(X_train.shape[0] * 100) / 1000000\n\n#%%\n\n10000 * 100\n\n#%%\n\nshow_scores(model)\n\n\n# ### Hyerparameter tuning with RandomizedSearchCV\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n 'from sklearn.model_selection import RandomizedSearchCV\\n\\n# Different RandomForestRegressor hyperparameters\\nrf_grid = {\"n_estimators\": np.arange(10, 100, 10),\\n \"max_depth\": [None, 3, 5, 10],\\n \"min_samples_split\": np.arange(2, 20, 2),\\n \"min_samples_leaf\": np.arange(1, 20, 2),\\n \"max_features\": [0.5, 1, \"sqrt\", \"auto\"],\\n \"max_samples\": [10000]}\\n\\n# Instantiate RandomizedSearchCV model\\nrs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,\\n random_state=42),\\n param_distributions=rf_grid,\\n n_iter=2,\\n cv=5,\\n verbose=True)\\n\\n# Fit the RandomizedSearchCV model\\nrs_model.fit(X_train, y_train)')\n\n#%%\n\n# Find the best model hyperparameters\nrs_model.best_params_\n\n#%%\n\n# Evaluate the RandomizedSearch model\nshow_scores(rs_model)\n\n\n# ### Train a model with the best hyperparamters\n#\n# **Note:** These were found after 100 iterations of `RandomizedSearchCV`.\n\n#%%\n\nget_ipython().run_cell_magic('time', '', '\\n# Most ideal hyperparamters\\nideal_model = RandomForestRegressor(n_estimators=40,\\n min_samples_leaf=1,\\n min_samples_split=14,\\n max_features=0.5,\\n n_jobs=-1,\\n max_samples=None,\\n random_state=42) # random state so our results are reproducible\\n\\n# Fit the ideal model\\nideal_model.fit(X_train, y_train)')\n\n#%%\n\n# Scores for ideal_model (trained on all the data)\nshow_scores(ideal_model)\n\n#%%\n\n# Scores on rs_model (only trained on ~10,000 examples)\nshow_scores(rs_model)\n\n\n# ## Make predictions on test data\n\n#%%\n\n# Import the test data\ndf_test = pd.read_csv(\"data/Test.csv\",\n low_memory=False,\n parse_dates=[\"saledate\"])\n\ndf_test.head()\n\n#%%\n\n# Make predictions on the test dataset\ntest_preds = ideal_model.predict(df_test)\n\n\n# ### Preprocessing the data (getting the test dataset in the same format as our training dataset)\n\n#%%\n\ndef preprocess_data(df):\n \"\"\"\n Performs transformations on df and returns transformed df.\n \"\"\"\n df[\"saleYear\"] = df.saledate.dt.year\n df[\"saleMonth\"] = df.saledate.dt.month\n df[\"saleDay\"] = df.saledate.dt.day\n df[\"saleDayOfWeek\"] = df.saledate.dt.dayofweek\n df[\"saleDayOfYear\"] = df.saledate.dt.dayofyear\n\n df.drop(\"saledate\", axis=1, inplace=True)\n\n # Fill the numeric rows with median\n for label, content in df.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n # Add a binary column which tells us if the data was missing or not\n df[label+\"_is_missing\"] = pd.isnull(content)\n # Fill missing numeric values with median\n df[label] = content.fillna(content.median())\n\n # Filled categorical missing data and turn categories into numbers\n if not pd.api.types.is_numeric_dtype(content):\n df[label+\"_is_missing\"] = pd.isnull(content)\n # We add +1 to the 
category code because pandas encodes missing categories as -1\n df[label] = pd.Categorical(content).codes+1\n\n return df\n\n#%%\n\n# Process the test data\ndf_test = preprocess_data(df_test)\ndf_test.head()\n\n#%%\n\n# Make predictions on updated test data\ntest_preds = ideal_model.predict(df_test)\n\n#%%\n\nX_train.head()\n\n#%%\n\n# We can find how the columns differ using sets\nset(X_train.columns) - set(df_test.columns)\n\n#%%\n\n# Manually adjust df_test to have auctioneerID_is_missing column\ndf_test[\"auctioneerID_is_missing\"] = False\ndf_test.head()\n\n#%%", "target_code": "test_preds = ideal_model.predict(df_test)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # \ud83d\ude9c Predicting the Sale Price of Bulldozers using Machine Learning\n#\n# In this notebook, we're going to go through an example machine learning project with the goal of predicting the sale price of bulldozers.\n#\n# ## 1. Problem defition\n#\n# > How well can we predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have been sold for?\n#\n# ## 2. Data\n#\n# The data is downloaded from the Kaggle Bluebook for Bulldozers competition: https://www.kaggle.com/c/bluebook-for-bulldozers/data\n#\n# There are 3 main datasets:\n#\n# * Train.csv is the training set, which contains data through the end of 2011.\n# * Valid.csv is the validation set, which contains data from January 1, 2012 - April 30, 2012 You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.\n# * Test.csv is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.\n#\n# ## 3. Evaluation\n#\n# The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.\n#\n# For more on the evaluation of this project check: https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation\n#\n# **Note:** The goal for most regression evaluation metrics is to minimize the error. For example, our goal for this project will be to build a machine learning model which minimises RMSLE.\n#\n# ## 4. Features\n#\n# Kaggle provides a data dictionary detailing all of the features of the dataset. 
You can view this data dictionary on Google Sheets: https://docs.google.com/spreadsheets/d/18ly-bLR8sbDJLITkWG7ozKm8l3RyieQ2Fpgix-beSYI/edit?usp=sharing\n\n\nfrom sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score\nfrom sklearn.ensemble import RandomForestRegressor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport sklearn\n\n\n# Import training and validation sets\ndf = pd.read_csv(\"data/TrainAndValid.csv\",\n low_memory=False)\n\n\ndf.info()\n\n\ndf.isna().sum()\n\n\ndf.columns\n\n\nfig, ax = plt.subplots()\nax.scatter(df[\"saledate\"][:1000], df[\"SalePrice\"][:1000])\n\n\ndf.saledate[:1000]\n\n\ndf.saledate.dtype\n\n\ndf.SalePrice.plot.hist()\n\n\n# ### Parsing dates\n#\n# When we work with time series data, we want to enrich the time & date component as much as possible.\n#\n# We can do that by telling pandas which of our columns has dates in it using the `parse_dates` parameter.\n\n\n# Import data again but this time parse dates\ndf = pd.read_csv(\"data/TrainAndValid.csv\",\n low_memory=False,\n parse_dates=[\"saledate\"])\n\n\ndf.saledate.dtype\n\n\ndf.saledate[:1000]\n\n\nfig, ax = plt.subplots()\nax.scatter(df[\"saledate\"][:1000], df[\"SalePrice\"][:1000])\n\n\ndf.head()\n\n\ndf.head().T\n\n\ndf.saledate.head(20)\n\n\n# ### Sort DataFrame by saledate\n#\n# When working with time series data, it's a good idea to sort it by date.\n\n\n# Sort DataFrame in date order\ndf.sort_values(by=[\"saledate\"], inplace=True, ascending=True)\ndf.saledate.head(20)\n\n\n# ### Make a copy of the original DataFrame\n#\n# We make a copy of the original dataframe so when we manipulate the copy, we've still got our original data.\n\n\n# Make a copy of the original DataFrame to perform edits on\ndf_tmp = df.copy()\n\n\n# ### Add datetime parameters for `saledate` column\n\n\ndf_tmp[\"saleYear\"] = df_tmp.saledate.dt.year\ndf_tmp[\"saleMonth\"] = df_tmp.saledate.dt.month\ndf_tmp[\"saleDay\"] = df_tmp.saledate.dt.day\ndf_tmp[\"saleDayOfWeek\"] = df_tmp.saledate.dt.dayofweek\ndf_tmp[\"saleDayOfYear\"] = df_tmp.saledate.dt.dayofyear\n\n\ndf_tmp.head().T\n\n\n# Now we've enriched our DataFrame with date time features, we can remove 'saledate'\ndf_tmp.drop(\"saledate\", axis=1, inplace=True)\n\n\n# Check the values of different columns\ndf_tmp.state.value_counts()\n\n\ndf_tmp.head()\n\n\nlen(df_tmp)\n\n\n# ## 5. 
Modelling\n#\n# We've done enough EDA (we could always do more) but let's start to do some model-driven EDA.\n\n\n# Let's build a machine learning model\n\nmodel = RandomForestRegressor(n_jobs=-1,\n random_state=42)\n\nmodel.fit(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])\n\n\ndf_tmp.info()\n\n\ndf_tmp[\"UsageBand\"].dtype\n\n\ndf_tmp.isna().sum()\n\n\n# ### Convert string to categories\n#\n# One way we can turn all of our data into numbers is by converting them into pandas catgories.\n#\n# We can check the different datatypes compatible with pandas here: https://pandas.pydata.org/pandas-docs/stable/reference/general_utility_functions.html#data-types-related-functionality\n\n\ndf_tmp.head().T\n\n\npd.api.types.is_string_dtype(df_tmp[\"UsageBand\"])\n\n\n# Find the columns which contain strings\nfor label, content in df_tmp.items():\n if pd.api.types.is_string_dtype(content):\n print(label)\n\n\n# If you're wondering what df.items() does, here's an example\nrandom_dict = {\"key1\": \"hello\",\n \"key2\": \"world!\"}\n\nfor key, value in random_dict.items():\n print(f\"this is a key: {key}\",\n f\"this is a value: {value}\")\n\n\n# This will turn all of the string value into category values\nfor label, content in df_tmp.items():\n if pd.api.types.is_string_dtype(content):\n df_tmp[label] = content.astype(\"category\").cat.as_ordered()\n\n\ndf_tmp.info()\n\n\ndf_tmp.state.cat.categories\n\n\ndf_tmp.state.cat.codes\n\n\n# Thanks to pandas Categories we now have a way to access all of our data in the form of numbers.\n#\n# But we still have a bunch of missing data...\n\n\n# Check missing data\ndf_tmp.isnull().sum()/len(df_tmp)\n\n\n# ### Save preprocessed data\n\n\n# Export current tmp dataframe\ndf_tmp.to_csv(\"data/train_tmp.csv\",\n index=False)\n\n\n# Import preprocessed data\ndf_tmp = pd.read_csv(\"data/train_tmp.csv\",\n low_memory=False)\ndf_tmp.head().T\n\n\ndf_tmp.isna().sum()\n\n\n# ## Fill missing values\n#\n# ### Fill numerical missing values first\n\n\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n print(label)\n\n\ndf_tmp.ModelID\n\n\n# Check for which numeric columns have null values\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n print(label)\n\n\n# Fill numeric rows with the median\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n # Add a binary column which tells us if the data was missing or not\n df_tmp[label+\"_is_missing\"] = pd.isnull(content)\n # Fill missing numeric values with median\n df_tmp[label] = content.fillna(content.median())\n\n\n# Demonstrate how median is more robust than mean\nhundreds = np.full((1000,), 100)\nhundreds_billion = np.append(hundreds, 1000000000)\nnp.mean(hundreds), np.mean(hundreds_billion), np.median(\n hundreds), np.median(hundreds_billion)\n\n\n# Check if there's any null numeric values\nfor label, content in df_tmp.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n print(label)\n\n\n# Check to see how many examples were missing\ndf_tmp.auctioneerID_is_missing.value_counts()\n\n\ndf_tmp.isna().sum()\n\n\n# ### Filling and turning categorical variables into numbers\n\n\n# Check for columns which aren't numeric\nfor label, content in df_tmp.items():\n if not pd.api.types.is_numeric_dtype(content):\n print(label)\n\n\n# Turn categorical variables into numbers and fill missing\nfor label, content in df_tmp.items():\n if 
not pd.api.types.is_numeric_dtype(content):\n # Add binary column to indicate whether sample had missing value\n df_tmp[label+\"_is_missing\"] = pd.isnull(content)\n # Turn categories into numbers and add +1\n df_tmp[label] = pd.Categorical(content).codes+1\n\n\n# + 1 to turn -1 to 0, so we know 0 is missing value\npd.Categorical(df_tmp[\"state\"]).codes+1\n\n\ndf_tmp.info()\n\n\ndf_tmp.head().T\n\n\ndf_tmp.isna().sum()\n\n\n# Now that all of data is numeric as well as our dataframe has no missing values, we should be able to build a machine learning model.\n\n\ndf_tmp.head()\n\n\nlen(df_tmp)\n\n\nget_ipython().run_cell_magic('time', '',\n '# Instantiate model\\nmodel = RandomForestRegressor(n_jobs=-1,\\n random_state=42)\\n\\n# Fit the model\\nmodel.fit(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])')\n\n\n# Score the model\nmodel.score(df_tmp.drop(\"SalePrice\", axis=1), df_tmp[\"SalePrice\"])\n\n\n# **Question:** Why doesn't the above metric hold water? (why isn't the metric reliable)\n\n# ### Splitting data into train/validation sets\n\n\ndf_tmp.saleYear\n\n\ndf_tmp.saleYear.value_counts()\n\n\n# Split data into training and validation\ndf_val = df_tmp[df_tmp.saleYear == 2012]\ndf_train = df_tmp[df_tmp.saleYear != 2012]\n\nlen(df_val), len(df_train)\n\n\n# Split data into X & y\nX_train, y_train = df_train.drop(\"SalePrice\", axis=1), df_train.SalePrice\nX_valid, y_valid = df_val.drop(\"SalePrice\", axis=1), df_val.SalePrice\n\nX_train.shape, y_train.shape, X_valid.shape, y_valid.shape\n\n\ny_train\n\n\n# ### Building an evaluation function\n\n\n# Create evaluation function (the competition uses RMSLE)\n\n\ndef rmsle(y_test, y_preds):\n \"\"\"\n Caculates root mean squared log error between predictions and\n true labels.\n \"\"\"\n return np.sqrt(mean_squared_log_error(y_test, y_preds))\n\n# Create function to evaluate model on a few different levels\n\n\ndef show_scores(model):\n train_preds = model.predict(X_train)\n val_preds = model.predict(X_valid)\n scores = {\"Training MAE\": mean_absolute_error(y_train, train_preds),\n \"Valid MAE\": mean_absolute_error(y_valid, val_preds),\n \"Training RMSLE\": rmsle(y_train, train_preds),\n \"Valid RMSLE\": rmsle(y_valid, val_preds),\n \"Training R^2\": r2_score(y_train, train_preds),\n \"Valid R^2\": r2_score(y_valid, val_preds)}\n return scores\n\n\n# ## Testing our model on a subset (to tune the hyperparameters)\n\n\n# # This takes far too long... 
for experimenting\n\n# %%time\n# model = RandomForestRegressor(n_jobs=-1,\n# random_state=42)\n\n# model.fit(X_train, y_train)\n\n\nlen(X_train)\n\n\n# Change max_samples value\nmodel = RandomForestRegressor(n_jobs=-1,\n random_state=42,\n max_samples=10000)\n\n\nget_ipython().run_cell_magic('time', '',\n '# Cutting down on the max number of samples each estimator can see improves training time\\nmodel.fit(X_train, y_train)')\n\n\n# original dataset size = X_train.shape[0] * 100\n# new dataset size = 10000 * 100\n# 40 times smaller\n(X_train.shape[0] * 100) / 1000000\n\n\n10000 * 100\n\n\nshow_scores(model)\n\n\n# ### Hyerparameter tuning with RandomizedSearchCV\n\n\nget_ipython().run_cell_magic('time', '',\n 'from sklearn.model_selection import RandomizedSearchCV\\n\\n# Different RandomForestRegressor hyperparameters\\nrf_grid = {\"n_estimators\": np.arange(10, 100, 10),\\n \"max_depth\": [None, 3, 5, 10],\\n \"min_samples_split\": np.arange(2, 20, 2),\\n \"min_samples_leaf\": np.arange(1, 20, 2),\\n \"max_features\": [0.5, 1, \"sqrt\", \"auto\"],\\n \"max_samples\": [10000]}\\n\\n# Instantiate RandomizedSearchCV model\\nrs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,\\n random_state=42),\\n param_distributions=rf_grid,\\n n_iter=2,\\n cv=5,\\n verbose=True)\\n\\n# Fit the RandomizedSearchCV model\\nrs_model.fit(X_train, y_train)')\n\n\n# Find the best model hyperparameters\nrs_model.best_params_\n\n\n# Evaluate the RandomizedSearch model\nshow_scores(rs_model)\n\n\n# ### Train a model with the best hyperparamters\n#\n# **Note:** These were found after 100 iterations of `RandomizedSearchCV`.\n\n\nget_ipython().run_cell_magic('time', '', '\\n# Most ideal hyperparamters\\nideal_model = RandomForestRegressor(n_estimators=40,\\n min_samples_leaf=1,\\n min_samples_split=14,\\n max_features=0.5,\\n n_jobs=-1,\\n max_samples=None,\\n random_state=42) # random state so our results are reproducible\\n\\n# Fit the ideal model\\nideal_model.fit(X_train, y_train)')\n\n\n# Scores for ideal_model (trained on all the data)\nshow_scores(ideal_model)\n\n\n# Scores on rs_model (only trained on ~10,000 examples)\nshow_scores(rs_model)\n\n\n# ## Make predictions on test data\n\n\n# Import the test data\ndf_test = pd.read_csv(\"data/Test.csv\",\n low_memory=False,\n parse_dates=[\"saledate\"])\n\ndf_test.head()\n\n\n# Make predictions on the test dataset\ntest_preds = ideal_model.predict(df_test)\n\n\n# ### Preprocessing the data (getting the test dataset in the same format as our training dataset)\n\n\ndef preprocess_data(df):\n \"\"\"\n Performs transformations on df and returns transformed df.\n \"\"\"\n df[\"saleYear\"] = df.saledate.dt.year\n df[\"saleMonth\"] = df.saledate.dt.month\n df[\"saleDay\"] = df.saledate.dt.day\n df[\"saleDayOfWeek\"] = df.saledate.dt.dayofweek\n df[\"saleDayOfYear\"] = df.saledate.dt.dayofyear\n\n df.drop(\"saledate\", axis=1, inplace=True)\n\n # Fill the numeric rows with median\n for label, content in df.items():\n if pd.api.types.is_numeric_dtype(content):\n if pd.isnull(content).sum():\n # Add a binary column which tells us if the data was missing or not\n df[label+\"_is_missing\"] = pd.isnull(content)\n # Fill missing numeric values with median\n df[label] = content.fillna(content.median())\n\n # Filled categorical missing data and turn categories into numbers\n if not pd.api.types.is_numeric_dtype(content):\n df[label+\"_is_missing\"] = pd.isnull(content)\n # We add +1 to the category code because pandas encodes missing categories as -1\n df[label] = 
pd.Categorical(content).codes+1\n\n return df\n\n\n# Process the test data\ndf_test = preprocess_data(df_test)\ndf_test.head()\n\n\n# Make predictions on updated test data\ntest_preds = ideal_model.predict(df_test)\n\n\nX_train.head()\n\n\n# We can find how the columns differ using sets\nset(X_train.columns) - set(df_test.columns)\n\n\n# Manually adjust df_test to have auctioneerID_is_missing column\ndf_test[\"auctioneerID_is_missing\"] = False\ndf_test.head()\n\n", "project_metadata": {"full_name": "chesterheng/machinelearning-datascience", "description": "Complete Machine Learning and Data Science: Zero to Mastery", "topics": ["machine-learning", "data-science"], "git_url": "git://github.com/chesterheng/machinelearning-datascience.git", "stars": 11, "watchers": 11, "forks": 6, "created": "2020-05-10T09:38:22Z", "size": 81175, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4070181}, "last_updated": "2020-12-07T17:13:28Z"}, "intent": "# Make predictions on the test data"}, {"original_comment": "# The object responsible for performing PCA\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principle Component Analysis\n# *Curtis Miller*\n#\n# **Principle component analysis (PCA)** is a dimensionality reduction technique that finds a feature space such that all features are uncorrelated with each other. Furthermore, the first feature is a linear combination of the observed features that maximizes the variance without \"magnifying\" those features effects. The second feature maximizes variance when the effect of the first feature has been removed from the dataset, and so on, so each feature \"explains\" a diminishing amount of variation in the original dataset. Dimensionality is thus reduced by choosing a smaller number of features found by PCA that account for most of the variation in the original dataset.\n#\n# ## Performing PCA\n#\n# In this notebook I demonstrate dimensionality reduction with PCA on the features of the Boston housing price dataset. First I load in the dataset.\n\n#%%\n\nimport numpy as np\nfrom sklearn.datasets import load_boston", "target_code": "from sklearn.decomposition import PCA\n\npca = PCA(n_components=2) # A two-dimensional representation of the dataset\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principle Component Analysis\n# *Curtis Miller*\n#\n# **Principle component analysis (PCA)** is a dimensionality reduction technique that finds a feature space such that all features are uncorrelated with each other. Furthermore, the first feature is a linear combination of the observed features that maximizes the variance without \"magnifying\" those features effects. The second feature maximizes variance when the effect of the first feature has been removed from the dataset, and so on, so each feature \"explains\" a diminishing amount of variation in the original dataset. Dimensionality is thus reduced by choosing a smaller number of features found by PCA that account for most of the variation in the original dataset.\n#\n# ## Performing PCA\n#\n# In this notebook I demonstrate dimensionality reduction with PCA on the features of the Boston housing price dataset. 
First I load in the dataset.\n\n\nimport numpy as np\nfrom sklearn.datasets import load_boston\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nboston_obj = load_boston()\ndata = boston_obj.data\ndata[:5, :]\n\n\n", "project_metadata": {"full_name": "PacktPublishing/Training-Systems-Using-Python-Statistical-Modeling", "description": "Training Systems Using Python Statistical Modeling, Published by Packt", "topics": [], "git_url": "git://github.com/PacktPublishing/Training-Systems-Using-Python-Statistical-Modeling.git", "stars": 8, "watchers": 8, "forks": 12, "created": "2019-05-14T05:42:21Z", "size": 9567, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11600689}, "last_updated": "2020-12-14T21:27:33Z"}, "intent": "# The object responsible for performing PCA"}, {"original_comment": "# Make Voronoi graph out of the last particle positions\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Processing of particle tracks\n# ### coded to work with experiments and simulation\n\n# ## _Import of python libraries used within the code_\n#\n# ### load both code cells \n\n#%%\n\n# HI HELLO\nimport ipywidgets as widgets # import\nfrom IPython.display import display, HTML # screen on display in HTML library\nimport matplotlib.colors as mcolors\nfrom matplotlib.collections import PolyCollection\nfrom matplotlib import rc\nfrom scipy import signal # noise filtering functions\nimport time # timing functions (for computation time)\nfrom collections import Counter # (not used)\nimport scipy.optimize as optimize # optimazation library (not used)\nfrom scipy.spatial import Voronoi, voronoi_plot_2d # import Voronoi functions\nimport scipy.special as spec\nimport scipy.stats as st\nimport scipy.constants as ct # import mathematical constants\n# double (or higher) floating point precision library (not used)\nimport mpmath as mp\nfrom sympy import var # symbolic calculation library variables\nimport sympy as sp # symbolic calculation library functions\nimport json\nimport sys\nimport subprocess # these 2 lines below load OS functionality\nimport matplotlib.pyplot as plt # plot library\nimport numpy as np # fast array library\nimport pandas as pd # data frames library\n# plot display type. 
use this to keep graphs within the notebook\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\nglobal iscaled\niscaled = 0\n\nglobal LX, LY\nLX = 1000 - 250\nLY = 650-50\n\n# system size for a given (npart, rho) configuration\n\n\ndef L(npart, rho):\n l = np.sqrt(npart*2*np.sqrt(3.)/rho)\n return l\n\n# print progress function\n# more elegant (python 3 alternative): print('\\rhello', end='',flush=True)\n\n\ndef printp(string):\n sys.stdout.write('\\r' + str(string))\n sys.stdout.flush()\n\n#from __future__ import print_function\n\n\n# Text format for figures configuration (lines below)\nrc('text', usetex=True)\nplt.rcParams['font.size'] = 24\n# special plotting functions\n\n\ndef set_up_graf(idf, LX, LY):\n global ax, fig_system, ss\n\n fig_system = plt.figure(idf, figsize=(6.5, 6.5*LY/LX))\n ss = (72./fig_system.dpi)**2 # particle size\n ax = fig_system.add_subplot(1, 1, 1)\n ax.set_ylim([0, LY])\n ax.set_xlim([0, LX])\n ax.set_xlabel(r'$x/\\sigma$', fontsize=24)\n ax.set_ylabel(r'$y/\\sigma$', fontsize=24)\n\n#%%\n\ndef progreso(val, vmax):\n\n a = widgets.IntProgress(\n value=val,\n min=0,\n max=vmax,\n step=1,\n description='Loading:',\n bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n orientation='horizontal'\n )\n return a\n\n#%%\n\nprogreso(2, 104)\n\n\n# ## _Read tracking data _\n#\n# We explain the action of each of these functions below:\n#\n# __get_info(hash_prefix) :__ Reads and prints info file with _hash prefix_ hash code. See output table in RUN CELL 0 below to grasp the info structure\n#\n# __simple_pickle_read(hash_prefix) :__\n# This 'simple_pickle_read' takes a pkl.xz file _hash prefix_ hash code, and which contains xy positions and tracks (no velocities) and stores in 'tabla' pandas data frame.\n#\n# * input: hash_prefix\n# * output: global variable _Ntracks_ (number of tracks)\n\n#%%\n\ndef pdisplay(info):\n display(HTML(info.to_html()))\n\n# This function reads pickle binary file (.pkl) with trajectories\n\n\ndef get_info(series_directory, hash_prefix):\n global info\n nombre = './Datos/' + series_directory + \\\n '/info/' + hash_prefix + '_experiment_info.txt'\n with open(nombre) as f:\n jsonstr = json.load(f)\n info = pd.json_normalize(jsonstr)\n pdisplay(info.T)\n return info\n\n\ndef simple_pickle_read(series_directory, hash_prefix):\n global Nframes, Ntracks\n # Read table in pickle format\n nombre = './Datos/' + series_directory + '/tracks/' + \\\n hash_prefix + '_rect_roi_trajectories.pkl.xz'\n tabla = pd.read_pickle(nombre, compression='infer')\n # por si los indices salen desordenados\n tabla = tabla.reset_index(drop=True)\n Nframes = np.max(tabla.frame)+1\n Ntracks = np.max(tabla.track)+1\n print('Number of frames in the movie: ', Nframes)\n print('Maximum track index: ', Ntracks)\n print('\\nFirst 5 rows of imported table:\\n')\n display(HTML(tabla.head().to_html()))\n tabla = tabla.sort_values(by=['frame', 'track']).reset_index(drop=True)\n return tabla\n\n\n# ## _Order experiment series by increasing packing fraction_\n# _(and for constant packing fraction, order by increasing temperature) _\n\n#%%\n\n# Read the pickle format table of experiments series\nlista_experimentos = pd.read_pickle(\n './Datos/info_table.pkl', compression='infer')\n\n# order experiments series table by packing fraction (primary) and temperature (secondary)\nlista_experimentos_ordenada = lista_experimentos.sort_values(\n by=['packing_fraction', 'temperature'])\n\n# Display the complete ordered table; HTML format\n# display(HTML(\n# 
lista_experimentos_ordenada.to_html()\n# ))\n\n# Export pkl table to Excel\n# lista_experimentos_ordenada.to_excel('info_table.xlsx')\n\n#%%\n\nlista_inf = lista_experimentos_ordenada[lista_experimentos_ordenada.packing_fraction > 0.69]\n\n#%%\n\nlista_save = lista_inf[lista_inf.packing_fraction < 0.99]['experiment_id']\n\n#%%\n\nlista_save.to_csv('rho4.dat', index=False)\n\n#%%\n\nlista_save.to_csv('rho1.dat', index=False)\n\n\n# ## _Reduce global table - functions_\n#\n# __reset_track_indexes(tabla0):__\n# Eliminates 'lonely' tracks (particles tracked for just 1 frame), re-indexes so that no track indexes are empty\n# * input: _tabla0_ original data frame (of tracks)\n# * output: RETURNS clean _tabla_ data frame, without empty tracks; AND original _tabla0_ with empty tracks still included\n# * output structure: _tabla0_, _tabla_\n#\n# __short_drop:(ishort, tabla):__\n# Eliminates tracks detected for n frames or less, re-indexes so that no track indexes are empty\n#\n# * Input: _ishort_ (threshold length of tracks: shorter tracks are not kept), _tabla_ (original tracks data frame)\n#\n# * Output: RETURNS _tabla_\\__short_ (frame), that is the input table _tabla_ without tracks shorter than _ishort_\n#\n\n#%%\n\ndef reset_track_indexes(tabla0):\n global Ntracks, Nframes\n \"\"\" This function takes a dataframe in which some trajectory indexes\n are missing (maybe due to having deleted short trajectories) and\n resets indexes so that we can loop over the tracks with 'range' \"\"\"\n # 'real_number_of_tracks' should be <= than 'current_last_particle_index'\n tabla = tabla0.copy()\n Ntracks = len(set(tabla.track))\n original_indexes = np.sort(list(set(tabla.track)))\n unsort_indexes = original_indexes\n fixed_indexes = np.arange(0, Ntracks, step=1, dtype=int)\n if (original_indexes == fixed_indexes).all() == False: # fix only if there are empty tracks\n # With these two lists we create a dictionary and map old values to new ones\n n_empty = np.max(tabla.track) - Ntracks\n replacement_dict = dict(zip(original_indexes, fixed_indexes))\n tabla.track = tabla.track.map(\n dict(zip(original_indexes, fixed_indexes)))\n print('no. of empty track indexes discarded: ', n_empty, '\\n')\n else:\n print('nothing to fix\\n')\n Ntracks = np.max(tabla.track)+1\n Nframes = np.max(tabla.frame)+1\n return tabla0, tabla\n\n# INPUT\n# ishort: number of minimum frames in a track (eliminates tracks under ishort time length)\n# tabla: pandas Data Frame to shorten\n# OUTPUT\n\n\ndef short_drop(ishort, tabla):\n global shorts_list, Ntracks\n shorts_list = []\n Ntracks = np.max(tabla.track)+1\n for i in range(Ntracks):\n t1 = track(i, tabla, False)\n if len(t1) < ishort+1:\n shorts_list.append(i)\n len0 = len(tabla)\n tabla = tabla.drop(t1['index'])\n texto = 'dropped track no. 
' + \\\n str(i)+'; data table length decreased in '+str(len(tabla)-len0)\n printp(texto)\n print('\\n')\n tabla_short = tabla.sort_values(\n by=['frame', 'track']).reset_index(drop=True)\n Nshorts = np.max(tabla_short.track)+1\n printp('Dropped out ' + str(Ntracks-Nshorts) +\n ' short tracks out of ' + str(Ntracks))\n # the line above is necessary so that eliminates index voids and shuffling after short drop\n print('\\n')\n return tabla_short\n\n\n# ## _Get tracks and states from global table - functions_\n#\n# __track(t_id, tabla, dropit) :__ builds track for one particle from the appropriate chunk of the source table, with only the lines for particle _t_\\__id_\n#\n# * Input: _t_\\__id_ indice de track, _tabla_ source frame fuente (contains tracks), _dropit_ boolean; if =True then erases old index column (tracks do not always begin in frame 0); most of the time you just want _dropit_=True\n#\n# * Output: RETURNS a chunk of source table _tabla_ with only the lines for particle _t_\\__id_\n#\n# __all\\___ __tracks(tabla, dropit):__ repeats the process in _track_ function for all existing particles\n#\n# * Input: _tabla_ (frame) data source, _dropit_ boolean variable set True to erase original table original line number\n#\n# * Output: RETURNS _track_ array of frames each wiith one particle track; builds _tr_\\__lengths_ array of tracks lengths\n#\n#\n# __state(it, tabla):__ gets one instantaneous state frome source frame _tabla_, at frame _it_\n#\n# * Input: _it_ frame no. to get the state from; _tabla_ source frame data\n#\n# * Output: RETURNS st frame chunk from _tabla_ source frame\n#\n# __all__\\___states(tabla):__ Builds all instantaneous states from the movie\n#\n# * Input: _tabla_ source frame data\n#\n# * Output: RETURNS array of frame chunks from _tabla_ source frame. each chunk being an instantaneous state\n\n#%%\n\n##### INDIVIDUAL TRACKS ##############\n# build a 't_id' indivitual track\ndef track(t_id, tabla, dropit):\n t1 = tabla.loc[tabla.track == t_id].reset_index(drop=dropit)\n return t1\n\n# build individual tracks from all kept tracks\n# OUTPUT\n# tr_lengths[i]: length of track no. 'i'. The total no. 
of tracks is stored in 'Ntracks'\n\n\ndef all_tracks(tabla, dropit):\n # length of track\n global tr_lengths\n tr_lengths = np.empty(Ntracks, dtype=int)\n tracks = [[] for i in range(Ntracks)]\n for i in range(Ntracks):\n tracks[i] = track(i, tabla, dropit)\n tr_lengths[i] = int(len(tracks[i]))\n return tracks\n\n##### INSTANTANEOUS STATES ##############\n# BUILD INSTANTANEOUS STATES OF THE SYSTEM\n\n\ndef state(it, tabla):\n st = tabla.loc[tabla.frame == it].reset_index(drop=True)\n # reset row index\n # (otherwise keeps chunked index of the original table)\n return st\n\n# build instantaneous states over all frames\n\n\ndef all_states(tabla):\n sts = [[] for i in range(Nframes)]\n for i in range(Nframes):\n sts[i] = state(i, tabla)\n return sts\n\n\n# ## _Low-Pass filters_\n#\n# __butter_lowpass(step,fps, arr) :__ Applies Butterworth low-pass filter to _arr_ array, with averaging width _step_, for a movie with _fps_ frame rate; _fr_ is the fraction of the maximum frequency that is allowed to pass\n#\n# * Input: _fr_, _step_, _fps_, _arr_; as described above\n# * Output: RETURNS an array of the same size as _arr_\n#\n# __cheby1_lowpass(step,fps, arr) :__ Applies Chebyshev type I low-pass filter to _arr_ array, with averaging width _step_, for a movie with _fps_ frame rate\n#\n# * Input: _step_, _fps_, _arr_; as described above\n# * Output: RETURNS an array of the same size as _arr_\n#\n#\n\n#%%\n\ndef butter_lowpass(fr, step, fps, arr):\n\n #N, Wn = signal.buttord(1./(1.*fps/step),1./fps, 1/step, fps*0.5)\n N, Wn = signal.buttord(fr/step, fr, 1/step, fps*0.5, 0.5/fps)\n b, a = signal.butter(N, Wn, 'low')\n yy = signal.filtfilt(b, a, np.squeeze(arr), padtype=None)\n return yy\n\n\ndef filter_tracks_butter(fr, step, fps, tabla):\n tabla_unfiltered = tabla.copy()\n for i in range(Ntracks):\n printp('Filtering positions for track no. 
' +\n str(i+1) + ' of ' + str(Ntracks))\n xbb = np.array(tabla.loc[tabla['track'] == i, 'x'])\n xb = butter_lowpass(fr, step, info.fps[0], xbb)\n tabla.loc[tabla['track'] == i, 'x'] = xb\n ybb = tabla.loc[tabla['track'] == i, 'y']\n yb = butter_lowpass(fr, step, info.fps[0], ybb)\n tabla.loc[tabla['track'] == i, 'y'] = yb\n print('\\n')\n return tabla_unfiltered, tabla\n\n# def cheby1_lowpass(step,fps,arr):\n# N, Wn = signal.cheb1ord(1./(1.*fps/step),1./fps, step, fps*0.5)\n# b, a = signal.cheby1(N, 1./fps, Wn, 'low')\n# y = signal.filtfilt(b, a, arr)\n# return y\n\n\n# ## _Get velocities (differences) and accelerations (differences of differences) from individual tracks_\n#\n# __vels(tabla) :__ Ouputs velocities from tracks table; by decomposing into individual trajectories first\n#\n# * Input: _tabla_ frame data source\n#\n# * Output: RETURNS _tabla_ with 2 new columns (_tabla['vx']_, _tabla['vy']_)\n\n#%%\n\ndef diffs(str01, str02, str1, str2, tabla):\n global last_id\n tabla[str1] = np.zeros(len(tabla))\n tabla[str2] = np.zeros(len(tabla))\n last_id = np.zeros(Ntracks, dtype=int)\n for k in range(Ntracks):\n printp('diffs: ' + str(k+1) + ' of ' + str(Ntracks) + ' tracks')\n t1 = track(k, tabla, False)\n tabla.loc[t1['index'][:-1], str1] = np.diff(t1[str01])\n tabla.loc[t1['index'][:-1], str2] = np.diff(t1[str02])\n last_id[k] = tabla.loc[tabla['track'] == k].index.values[-1:]\n tabla = tabla.drop(last_id)\n print('\\n')\n return tabla\n\n\n# ## Bring data to physical units\n#\n# We take as coordinate origin the mid-point of the max and min X and Y positions ever tracked within the region of interest (ROI) during the experiment.\n#\n# We have 2 options for lengh unit:\n# * milimeters\n# * Ball diameter\n#\n# The time scale unit is seconds.\n#\n# Only after this step position differences in the original data table are re-scaled as real velocities\n\n#%%\n\n# PHYSICAL SCALES AND ORIGIN\n# Usage: re_pos_scale(0) for 1 mm as length unit; re_pos_scale(1) for ball diameter (sigma)\n# length unit\n\ndef set_origin(shiftx, shifty, tabla):\n tabla_not_shifted = tabla.copy()\n tabla['x'] -= shiftx\n tabla['y'] -= shifty\n return tabla_not_shifted, tabla\n\n\ndef scale(l_unit, t_unit, tabla):\n global iscaled\n tabla_not_scaled = tabla.copy()\n if iscaled == 0:\n inv_l_unit = 1./l_unit\n print(inv_l_unit)\n if np.any(tabla.columns == 'vx') or np.any(tabla.columns == 'vy'):\n print(inv_l_unit)\n tabla[['x', 'y']] *= inv_l_unit\n tabla[['vx', 'vy']] *= (inv_l_unit * t_unit)\n if np.any(tabla.columns == 'ax') or np.any(tabla.columns == 'ay'):\n tabla[['ax', 'ay']] *= (inv_l_unit * t_unit**2)\n else:\n tabla[['x', 'y']] *= inv_l_unit\n iscaled = 1\n else:\n tabla[['x', 'y']] *= 1\n iscaled = 0\n return tabla_not_scaled, tabla\n\n\n# ## RUN CELL 0\n# ### - Import pkl.xz tracks table\n# ### - create all individual tracks arrays\n# ### - analyze track length histogram, decide minimum track length\n\n#%%\n\n# COMPLETE SET OF READING INSTRUCTIONS\n\nseries_directory = 'ppp/rho_020_049'\nhash_prefix = '0f9e94e168aa137db93f8b064448c7e2'\n\ninfo = get_info(series_directory, hash_prefix)\n\ndatos = simple_pickle_read(series_directory, hash_prefix)\ndatos_orig, datos = reset_track_indexes(datos)\n\n# Build all tracks, storing them individually\ntracks = all_tracks(datos, True)\nprint('\\n min for positions:\\n', np.min(datos[['x', 'y']]), '\\n')\nprint('max. 
for positions:\\n', np.max(datos[['x', 'y']]), '\\n')\n\n#%%\n\n# Histogram of trajectory length\nplt.figure(1, figsize=(8, 8/ct.golden))\nplt.title('Trajectory lengths')\n# set upper height limit for histogram\ntrack_lengths = [len(tracks[i]) for i in range(Ntracks)]\nplt.ylim(0, 80)\n# number of bins for tr4ack length histogram\nnbins = 200\n# plot\nplt.ylim(0, 50)\nfig = plt.hist(track_lengths, nbins, color='b', range=(0, 200))\n\n\n# ## RUN CELL 1\n# ### - eliminate short tracks\n# ### - obtain position differences (vels)\n# ### - obtain velocities differences (accels)\n# ### - create all (non-short) individual tracks arrays. (minimum theoretical length: 5)\n\n#%%\n\ntracks_table = [[] for i in range(info.n_frames[0])]\n\n#%%\n\n# discard short tracks\nishort = 10 # minimum length of kept tracks\ntracks_table[ishort] = short_drop(ishort, datos)\ntracks_orig, tracks_table[ishort] = reset_track_indexes(tracks_table[ishort])\niscaled = 0\n\n# (butterworth)-filter particle positions\ntracks_unfiltered, tracks_table[ishort] = filter_tracks_butter(\n 0.8, 4, info.fps[0], tracks_table[ishort])\n\n# calculate velocities for all tracks\n#tracks_table[ishort] = diffs('x','y','vx','vy', tracks_table[ishort]);\n\n# calculate accelerations for all tracks\n#tracks_table[ishort] = diffs('vx','vy','ax','ay', tracks_table[ishort]);\n\n# print table head\nprint('not scaled: \\n')\ndisplay(HTML(tracks_table[ishort].head().to_html()))\n\n\n# scale and re-position the system (left-bottom corner is (0,0))\niscaled = 0\nnot_shifted, tracks_table[ishort] = set_origin(\n np.min(datos.x), np.min(datos.y), tracks_table[ishort]) # re-position\n\nnot_scaled, tracks_table[ishort] = scale(\n info.particle_diameter_px[0], 10**3/info.fps[0], tracks_table[ishort]) # scale\n\n# print table head\nprint('scaled: \\n')\ndisplay(HTML(tracks_table[ishort].head().to_html()))\n\n\n# store tracks individually\n#tracks = all_tracks(tracks_table[ishort], True)\n#\n# store system (instantaneous) states individually\n#states = all_states(tracks_table[ishort])\n\n#%%\n\ntracks_table[ishort].tail()\n\n\n# ## _Plotting functions_\n#\n# __frameit(ax):__ frames a figure by drawing the image limits and the ROI limits as well\n#\n# __plt__\\___track(t_id,xs,tagit,fr\\_it):__ plots just one track (_t_\\__id_) within _fr_\\__it_ (if True) frames, with size _xs_ and prints the track no. if _tagit_ is True\n#\n# __plt__\\___tracks(init_id, final_id,xs,tagit,fr\\_it):__ plots _init_\\__id_ to _final_\\__id_ tracks within _fr_\\__it_ (if True) frames, with size _xs_ and prints the track no. 
if _tagit_ is True\n\n#%%\n\ndef frameit(ax):\n ax.set_xlim(0, info['shape'][0][0])\n ax.set_ylim(0, info['shape'][0][1])\n rect = plt.Rectangle([250, 50], 750, 600, alpha=1,\n lw=10, fill=False, edgecolor='b')\n ax.add_artist(rect)\n\n# Plot just one track function (tagit?, frameit?)\n\n\ndef plt_track(t_id, xs, tagit, fr_it):\n if fr_it == True:\n fig, ax = plt.subplots(\n figsize=(xs, xs*info['shape'][0][1]/info['shape'][0][0]))\n px_size = 72./fig.dpi\n frameit(ax)\n else:\n fig, ax = plt.subplots()\n px_size = 72./fig.dpi\n\n plt.plot(tracks[t_id].x, tracks[t_id].y, '.',\n c='r', markersize=px_size, linewidth=None)\n if tagit == True:\n plt.text(np.mean(tracks[t_id].x), np.mean(tracks[t_id].y), str(t_id))\n\n\ndef plt_tracks(init_id, final_id, xs, tagit, fr_it):\n if fr_it == True:\n fig, ax = plt.subplots(\n figsize=(xs, xs*info['shape'][0][1]/info['shape'][0][0]))\n px_size = 72./fig.dpi\n frameit(ax)\n else:\n fig, ax = plt.subplots()\n px_size = 72./fig.dpi\n for i in range(init_id, final_id):\n plt.plot(tracks[i].x, tracks[i].y, '.', c='r',\n markersize=px_size, linewidth=None)\n if tagit == True:\n plt.text(np.mean(tracks[i].x), np.mean(tracks[i].y), str(i))\n\n#%%\n\n# the scatter plot:\nset_up_graf(15, LX, LY)\nplt.hist2d(tracks_table[ishort]['x'],\n tracks_table[ishort]['y'], bins=350, cmap='gray')\nplt.clim(-10, 100)\n\n#plt.savefig('hist2d_N60_ap30.jpeg', bbox_inches='tight', quality=100)\n\nss100 = state(100, tracks_table[ishort])\nxy = ss100[['x', 'y']]\nnpart = len(ss100)\n\nX_off = 0.09 # this is the width of the border layer to discard\nY_off = X_off*0.5\n\n# Make Voronoi graph out of the last particle positions\nvor = Voronoi(xy)\n# Eliminate outer cells\ninner = inner_cells(vor, LX=LX, X_off=X_off, LY=LY, Y_off=Y_off)\n# Represent particle positions and inner Voronoi cells\nset_up_graf(16, LX, LY)\ngraf = graf_vor(1, 1, inner, LX, LY)\n\n#plt.savefig('voronoi_N160_ap30.jpeg', bbox_inches='tight', quality=100)\n\n#%%\n\nplt.hist(np.abs(tracks_table[ishort].vx), 100, color='b')\n\n#%%\n\nplt.hist(np.abs(tracks_table[ishort].vy), 100, color='b')\n\n\n# ## _Average in 'grains'_\n#\n# __grain(arr, step) :__ Builds coarse-grained trajectories. 
Designed for noisy trajectories.\n#\n# * Input: _arr_ array of data to coarse grain; _step_ the coarse-grain size (in time intervals)\n# * Output: RETURNS a data array\n\n#%%\n\ndef grain(arr, step):\n invdt = 1./step\n n_arr = len(arr)\n gr_arr = [[i + 0.5*step, (arr[i+step]-arr[i])*invdt]\n for i in range(0, n_arr-step, step)]\n return np.array(gr_arr)\n\n#%%\n\nNtracks\n\n#%%\n\n# plot just one track, tagit, dont frame it\nplt_track(100, 7, True, False)\n\n#%%\n\n# plot just one track, dont tagit, frame it\nplt_tracks(0, Ntracks, 7, False, True)\n\n#%%\n\n# plot just one track, dont tagit, frame it\nplt_tracks(0, Ntracks, 7, False, True)\n\n#%%\n\n# plot first 40 tracks, tag'em, dont frame them\nplt_tracks(0, 40, 7, True, False)\n\n#%%\n\n# same as above, but framed\nplt_tracks(0, 150, 7, True, True)\n\n#%%\n\n# draw all tracks (notice the tendency towards the right, specially low corner)\nplt_tracks(0, Ntracks, 7, True, True)\n\n#%%\n\n# the scatter plot:\nplt.figure(figsize=(4, 4), dpi=140)\nplt.xlabel(r'$x/\\sigma$')\nplt.ylabel(r'$y/\\sigma$')\nplt.xlim(252, 1000)\nplt.ylim(60, 650)\nplt.hist2d(tracks_table[ishort]['x'],\n tracks_table[ishort]['y'], bins=350, cmap='gray')\nplt.clim(-10, 100)\n\n#%%\n\nplt.savefig('hist2d_N60_ap30.jpeg', bbox_inches='tight', quality=100)\n\n\n# ## Static structure functions\n\n# ## PAIR CORRELATION FUNCTION _g(r)_\n\n#%%\n\n6.67*1.e-11*0.11*5.97*1.e24/(3393.5*1.e3)**2\n\n#%%\n\n(2*np.pi)**2*0.7/1.68**2\n\n#%%\n\n2.*mp.pi*(0.7/3.80)**0.5\n\n#%%\n\n# TRUE RADIAL DISTRIBUTION FUNCTION, simple version, no corner corrections\n\ndef gr(tabla, nrbin, LX, LY):\n global dr, hist, edges\n # The number of g(r) measurements is the binomial cofficient\n # since the points order in pairs does not matter; i.e., g(ri-rj)=g(rj-ri)\n npart = len(tabla)\n nr_measures = int(spec.binom(npart, 2))\n # print('measures per bin: ', nr_measures/nrbin) # print no. 
of measuremnts per bin\n # calculate the bin size, relative to particle radii\n drr = np.sqrt(LX**2 + LY**2)/nrbin\n # initialize array of radial distr function distances array and bin index\n dr = np.zeros(nr_measures)\n ii = 0\n # Loop over all statistically relevant particle pairs\n # calculating distances\n for i in range(npart-1):\n if (i % 100 == 0 and i > 0):\n printp(str(i) + ' particles performed')\n for j in range(i+1, npart):\n dr[ii] = (tabla.x[i]-tabla.x[j])**2 + (tabla.y[i] - tabla.y[j])**2\n if dr[ii] < 78:\n print('careful\\n')\n ii = ii + 1\n dr = np.sqrt(dr)\n # count number of measured distances per bin; i.e., get the g(r)\n hist, edges = np.histogram(dr, bins=nrbin)\n # create array of bin centers\n r = [edges[i] - 0.5*drr for i in range(len(hist))]\n # normalize the radial distribution function\n g = [(LX*LY/npart)*hist[i]/(2*np.pi*drr*r[i]) for i in range(len(hist))]\n #r = r[:-1]\n return r, g\n\n#%%\n\nr, g = gr(state(50, datos_250), 200, 750, 600)\n\n#%%\n\nNt = int(np.max(datos_250.frame))\ngdr = [np.zeros(100) for i in range(Nt)]\n\nfor i in range(1, Nt):\n if (i % 100 == 0 and i > 0):\n printp(str(i) + ' states performed out of '+str(Nt))\n r, gdr[i] = gr(state(i, datos_250), 100, 750, 600)\n\n#%%\n\ng = np.array(gdr[1])\nfor i in range(2, Nt):\n g = g + np.array(gdr[i])\ng = g/(1.0*Nt)\n\n#%%\n\nplt.plot(r, g)\n\n#%%\n\nplt.plot(r[:-1], np.array(g), 's', markersize=4)\n\n\n# ## VELOCITY AUTOCORRELATIONS\n\n#%%\n\n# VELOCITY AUTOCORRELATION functions\n\n# correlations for a track\ndef vel_autocorr_track(tabla):\n lt = len(tabla)\n vacx = np.empty(lt-1)\n vacy = np.empty(lt-1)\n tvc = np.empty(lt-1)\n for i in range(1, lt-1):\n tvc[i-1] = i\n vacx[i-1] = np.dot(tabla.vx[:-i], tabla.vx[i:])/(lt-i)\n vacy[i-1] = np.dot(tabla.vy[:-i], tabla.vy[i:])/(lt-i)\n return tvc, vacx, vacy\n\n# correlations average for all kept tracks\n\n\ndef vel_autocorr(tabla):\n av_vacx = np.zeros(Nframes-1)\n av_vacy = np.zeros(Nframes-1)\n no_measurmnts = np.zeros(Nframes, dtype=int)\n for i in range(Ntracks):\n tvc, vacx, vacy = vel_autocorr_track(tracks[i])\n av_vacx[:tr_lengths[i]-1] = av_vacx[:tr_lengths[i]-1] + vacx\n av_vacy[:tr_lengths[i]-1] = av_vacy[:tr_lengths[i]-1] + vacy\n no_measurmnts[:tr_lengths[i]] = no_measurmnts[:tr_lengths[i]] + 1\n printp('track ' + str(i) + ' of ' + str(Ntracks))\n for i in range(Nframes-1):\n av_vacx[i] = av_vacx[i]/no_measurmnts[i]\n av_vacy[i] = av_vacy[i]/no_measurmnts[i]\n return av_vacx, av_vacy\n\n#%%\n\ntracks = all_tracks(datos_1000)\n\n#%%\n\ntvc, vacx, vacy = vel_autocorr_track(tracks[1])\n\n#%%\n\n# track10 , datos_2000\nfig = plt.figure(1)\npx_size = 72./fig.dpi\nplt.xlim(-10, 1000)\nplt.ylim(-0.1, 0.3)\nplt.scatter(tvc, vacx, marker='.', color='red', s=px_size)\nplt.scatter(tvc, vacy, marker='.', color='blue', s=px_size)\n\n\n# ## BASE $XY$ AND VORONOI FUNCTIONS\n\n#%%\n\nLY\n\n#%%\n\nss100 = state(100, tracks_table[ishort])\n\n#%%\n\nlen(ss100)\n\n#%%\n\n# The 3 previous lines allow for precise input of LX (if LX=LY)\nLX = LX/info.particle_diameter_px[0]\nLY = LY/info.particle_diameter_px[0]\n\n#%%\n\nxy = ss100[['x', 'y']]\nnpart = len(ss100)\n\nX_off = 0.05 # this is the width of the border layer to discard\nY_off = X_off*0.5", "target_code": "vor = Voronoi(xy)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Processing of particle tracks\n# ### coded to work with experiments and simulation\n\n# ## _Import of python libraries used within the code_\n#\n# ### load both code cells \n\n\n# HI HELLO\nimport ipywidgets as 
widgets # import\nfrom IPython.display import display, HTML # screen on display in HTML library\nimport matplotlib.colors as mcolors\nfrom matplotlib.collections import PolyCollection\nfrom matplotlib import rc\nfrom scipy import signal # noise filtering functions\nimport time # timing functions (for computation time)\nfrom collections import Counter # (not used)\nimport scipy.optimize as optimize # optimazation library (not used)\nfrom scipy.spatial import Voronoi, voronoi_plot_2d # import Voronoi functions\nimport scipy.special as spec\nimport scipy.stats as st\nimport scipy.constants as ct # import mathematical constants\n# double (or higher) floating point precision library (not used)\nimport mpmath as mp\nfrom sympy import var # symbolic calculation library variables\nimport sympy as sp # symbolic calculation library functions\nimport json\nimport sys\nimport subprocess # these 2 lines below load OS functionality\nimport matplotlib.pyplot as plt # plot library\nimport numpy as np # fast array library\nimport pandas as pd # data frames library\n# plot display type. use this to keep graphs within the notebook\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\nglobal iscaled\niscaled = 0\n\nglobal LX, LY\nLX = 1000 - 250\nLY = 650-50\n\n# system size for a given (npart, rho) configuration\n\n\ndef L(npart, rho):\n l = np.sqrt(npart*2*np.sqrt(3.)/rho)\n return l\n\n# print progress function\n# more elegant (python 3 alternative): print('\\rhello', end='',flush=True)\n\n\ndef printp(string):\n sys.stdout.write('\\r' + str(string))\n sys.stdout.flush()\n\n#from __future__ import print_function\n\n\n# Text format for figures configuration (lines below)\nrc('text', usetex=True)\nplt.rcParams['font.size'] = 24\n# special plotting functions\n\n\ndef set_up_graf(idf, LX, LY):\n global ax, fig_system, ss\n\n fig_system = plt.figure(idf, figsize=(6.5, 6.5*LY/LX))\n ss = (72./fig_system.dpi)**2 # particle size\n ax = fig_system.add_subplot(1, 1, 1)\n ax.set_ylim([0, LY])\n ax.set_xlim([0, LX])\n ax.set_xlabel(r'$x/\\sigma$', fontsize=24)\n ax.set_ylabel(r'$y/\\sigma$', fontsize=24)\n\n\ndef progreso(val, vmax):\n\n a = widgets.IntProgress(\n value=val,\n min=0,\n max=vmax,\n step=1,\n description='Loading:',\n bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n orientation='horizontal'\n )\n return a\n\n\nprogreso(2, 104)\n\n\n# ## _Read tracking data _\n#\n# We explain the action of each of these functions below:\n#\n# __get_info(hash_prefix) :__ Reads and prints info file with _hash prefix_ hash code. 
See output table in RUN CELL 0 below to grasp the info structure\n#\n# __simple_pickle_read(hash_prefix) :__\n# This 'simple_pickle_read' takes a pkl.xz file _hash prefix_ hash code, and which contains xy positions and tracks (no velocities) and stores in 'tabla' pandas data frame.\n#\n# * input: hash_prefix\n# * output: global variable _Ntracks_ (number of tracks)\n\n\ndef pdisplay(info):\n display(HTML(info.to_html()))\n\n# This function reads pickle binary file (.pkl) with trajectories\n\n\ndef get_info(series_directory, hash_prefix):\n global info\n nombre = './Datos/' + series_directory + \\\n '/info/' + hash_prefix + '_experiment_info.txt'\n with open(nombre) as f:\n jsonstr = json.load(f)\n info = pd.json_normalize(jsonstr)\n pdisplay(info.T)\n return info\n\n\ndef simple_pickle_read(series_directory, hash_prefix):\n global Nframes, Ntracks\n # Read table in pickle format\n nombre = './Datos/' + series_directory + '/tracks/' + \\\n hash_prefix + '_rect_roi_trajectories.pkl.xz'\n tabla = pd.read_pickle(nombre, compression='infer')\n # por si los indices salen desordenados\n tabla = tabla.reset_index(drop=True)\n Nframes = np.max(tabla.frame)+1\n Ntracks = np.max(tabla.track)+1\n print('Number of frames in the movie: ', Nframes)\n print('Maximum track index: ', Ntracks)\n print('\\nFirst 5 rows of imported table:\\n')\n display(HTML(tabla.head().to_html()))\n tabla = tabla.sort_values(by=['frame', 'track']).reset_index(drop=True)\n return tabla\n\n\n# ## _Order experiment series by increasing packing fraction_\n# _(and for constant packing fraction, order by increasing temperature) _\n\n\n# Read the pickle format table of experiments series\nlista_experimentos = pd.read_pickle(\n './Datos/info_table.pkl', compression='infer')\n\n# order experiments series table by packing fraction (primary) and temperature (secondary)\nlista_experimentos_ordenada = lista_experimentos.sort_values(\n by=['packing_fraction', 'temperature'])\n\n# Display the complete ordered table; HTML format\n# display(HTML(\n# lista_experimentos_ordenada.to_html()\n# ))\n\n# Export pkl table to Excel\n# lista_experimentos_ordenada.to_excel('info_table.xlsx')\n\n\nlista_inf = lista_experimentos_ordenada[lista_experimentos_ordenada.packing_fraction > 0.69]\n\n\nlista_save = lista_inf[lista_inf.packing_fraction < 0.99]['experiment_id']\n\n\nlista_save.to_csv('rho4.dat', index=False)\n\n\nlista_save.to_csv('rho1.dat', index=False)\n\n\n# ## _Reduce global table - functions_\n#\n# __reset_track_indexes(tabla0):__\n# Eliminates 'lonely' tracks (particles tracked for just 1 frame), re-indexes so that no track indexes are empty\n# * input: _tabla0_ original data frame (of tracks)\n# * output: RETURNS clean _tabla_ data frame, without empty tracks; AND original _tabla0_ with empty tracks still included\n# * output structure: _tabla0_, _tabla_\n#\n# __short_drop:(ishort, tabla):__\n# Eliminates tracks detected for n frames or less, re-indexes so that no track indexes are empty\n#\n# * Input: _ishort_ (threshold length of tracks: shorter tracks are not kept), _tabla_ (original tracks data frame)\n#\n# * Output: RETURNS _tabla_\\__short_ (frame), that is the input table _tabla_ without tracks shorter than _ishort_\n#\n\n\ndef reset_track_indexes(tabla0):\n global Ntracks, Nframes\n \"\"\" This function takes a dataframe in which some trajectory indexes\n are missing (maybe due to having deleted short trajectories) and\n resets indexes so that we can loop over the tracks with 'range' \"\"\"\n # 'real_number_of_tracks' should be <= 
than 'current_last_particle_index'\n tabla = tabla0.copy()\n Ntracks = len(set(tabla.track))\n original_indexes = np.sort(list(set(tabla.track)))\n unsort_indexes = original_indexes\n fixed_indexes = np.arange(0, Ntracks, step=1, dtype=int)\n if (original_indexes == fixed_indexes).all() == False: # fix only if there are empty tracks\n # With these two lists we create a dictionary and map old values to new ones\n n_empty = np.max(tabla.track) - Ntracks\n replacement_dict = dict(zip(original_indexes, fixed_indexes))\n tabla.track = tabla.track.map(\n dict(zip(original_indexes, fixed_indexes)))\n print('no. of empty track indexes discarded: ', n_empty, '\\n')\n else:\n print('nothing to fix\\n')\n Ntracks = np.max(tabla.track)+1\n Nframes = np.max(tabla.frame)+1\n return tabla0, tabla\n\n# INPUT\n# ishort: number of minimum frames in a track (eliminates tracks under ishort time length)\n# tabla: pandas Data Frame to shorten\n# OUTPUT\n\n\ndef short_drop(ishort, tabla):\n global shorts_list, Ntracks\n shorts_list = []\n Ntracks = np.max(tabla.track)+1\n for i in range(Ntracks):\n t1 = track(i, tabla, False)\n if len(t1) < ishort+1:\n shorts_list.append(i)\n len0 = len(tabla)\n tabla = tabla.drop(t1['index'])\n texto = 'dropped track no. ' + \\\n str(i)+'; data table length decreased in '+str(len(tabla)-len0)\n printp(texto)\n print('\\n')\n tabla_short = tabla.sort_values(\n by=['frame', 'track']).reset_index(drop=True)\n Nshorts = np.max(tabla_short.track)+1\n printp('Dropped out ' + str(Ntracks-Nshorts) +\n ' short tracks out of ' + str(Ntracks))\n # the line above is necessary so that eliminates index voids and shuffling after short drop\n print('\\n')\n return tabla_short\n\n\n# ## _Get tracks and states from global table - functions_\n#\n# __track(t_id, tabla, dropit) :__ builds track for one particle from the appropriate chunk of the source table, with only the lines for particle _t_\\__id_\n#\n# * Input: _t_\\__id_ indice de track, _tabla_ source frame fuente (contains tracks), _dropit_ boolean; if =True then erases old index column (tracks do not always begin in frame 0); most of the time you just want _dropit_=True\n#\n# * Output: RETURNS a chunk of source table _tabla_ with only the lines for particle _t_\\__id_\n#\n# __all\\___ __tracks(tabla, dropit):__ repeats the process in _track_ function for all existing particles\n#\n# * Input: _tabla_ (frame) data source, _dropit_ boolean variable set True to erase original table original line number\n#\n# * Output: RETURNS _track_ array of frames each wiith one particle track; builds _tr_\\__lengths_ array of tracks lengths\n#\n#\n# __state(it, tabla):__ gets one instantaneous state frome source frame _tabla_, at frame _it_\n#\n# * Input: _it_ frame no. to get the state from; _tabla_ source frame data\n#\n# * Output: RETURNS st frame chunk from _tabla_ source frame\n#\n# __all__\\___states(tabla):__ Builds all instantaneous states from the movie\n#\n# * Input: _tabla_ source frame data\n#\n# * Output: RETURNS array of frame chunks from _tabla_ source frame. each chunk being an instantaneous state\n\n\n##### INDIVIDUAL TRACKS ##############\n# build a 't_id' indivitual track\ndef track(t_id, tabla, dropit):\n t1 = tabla.loc[tabla.track == t_id].reset_index(drop=dropit)\n return t1\n\n# build individual tracks from all kept tracks\n# OUTPUT\n# tr_lengths[i]: length of track no. 'i'. The total no. 
of tracks is stored in 'Ntracks'\n\n\ndef all_tracks(tabla, dropit):\n # length of track\n global tr_lengths\n tr_lengths = np.empty(Ntracks, dtype=int)\n tracks = [[] for i in range(Ntracks)]\n for i in range(Ntracks):\n tracks[i] = track(i, tabla, dropit)\n tr_lengths[i] = int(len(tracks[i]))\n return tracks\n\n##### INSTANTANEOUS STATES ##############\n# BUILD INSTANTANEOUS STATES OF THE SYSTEM\n\n\ndef state(it, tabla):\n st = tabla.loc[tabla.frame == it].reset_index(drop=True)\n # reset row index\n # (otherwise keeps chunked index of the original table)\n return st\n\n# build instantaneous states over all frames\n\n\ndef all_states(tabla):\n sts = [[] for i in range(Nframes)]\n for i in range(Nframes):\n sts[i] = state(i, tabla)\n return sts\n\n\n# ## _Low-Pass filters_\n#\n# __butter_lowpass(step,fps, arr) :__ Applies Butterworth low-pass filter to _arr_ array, with averaging width _step_, for a movie with _fps_ frame rate; _fr_ is the fraction of the maximum frequency that is allowed to pass\n#\n# * Input: _fr_, _step_, _fps_, _arr_; as described above\n# * Output: RETURNS an array of the same size as _arr_\n#\n# __cheby1_lowpass(step,fps, arr) :__ Applies Chebyshev type I low-pass filter to _arr_ array, with averaging width _step_, for a movie with _fps_ frame rate\n#\n# * Input: _step_, _fps_, _arr_; as described above\n# * Output: RETURNS an array of the same size as _arr_\n#\n#\n\n\ndef butter_lowpass(fr, step, fps, arr):\n\n #N, Wn = signal.buttord(1./(1.*fps/step),1./fps, 1/step, fps*0.5)\n N, Wn = signal.buttord(fr/step, fr, 1/step, fps*0.5, 0.5/fps)\n b, a = signal.butter(N, Wn, 'low')\n yy = signal.filtfilt(b, a, np.squeeze(arr), padtype=None)\n return yy\n\n\ndef filter_tracks_butter(fr, step, fps, tabla):\n tabla_unfiltered = tabla.copy()\n for i in range(Ntracks):\n printp('Filtering positions for track no. 
' +\n str(i+1) + ' of ' + str(Ntracks))\n xbb = np.array(tabla.loc[tabla['track'] == i, 'x'])\n xb = butter_lowpass(fr, step, info.fps[0], xbb)\n tabla.loc[tabla['track'] == i, 'x'] = xb\n ybb = tabla.loc[tabla['track'] == i, 'y']\n yb = butter_lowpass(fr, step, info.fps[0], ybb)\n tabla.loc[tabla['track'] == i, 'y'] = yb\n print('\\n')\n return tabla_unfiltered, tabla\n\n# def cheby1_lowpass(step,fps,arr):\n# N, Wn = signal.cheb1ord(1./(1.*fps/step),1./fps, step, fps*0.5)\n# b, a = signal.cheby1(N, 1./fps, Wn, 'low')\n# y = signal.filtfilt(b, a, arr)\n# return y\n\n\n# ## _Get velocities (differences) and accelerations (differences of differences) from individual tracks_\n#\n# __vels(tabla) :__ Ouputs velocities from tracks table; by decomposing into individual trajectories first\n#\n# * Input: _tabla_ frame data source\n#\n# * Output: RETURNS _tabla_ with 2 new columns (_tabla['vx']_, _tabla['vy']_)\n\n\ndef diffs(str01, str02, str1, str2, tabla):\n global last_id\n tabla[str1] = np.zeros(len(tabla))\n tabla[str2] = np.zeros(len(tabla))\n last_id = np.zeros(Ntracks, dtype=int)\n for k in range(Ntracks):\n printp('diffs: ' + str(k+1) + ' of ' + str(Ntracks) + ' tracks')\n t1 = track(k, tabla, False)\n tabla.loc[t1['index'][:-1], str1] = np.diff(t1[str01])\n tabla.loc[t1['index'][:-1], str2] = np.diff(t1[str02])\n last_id[k] = tabla.loc[tabla['track'] == k].index.values[-1:]\n tabla = tabla.drop(last_id)\n print('\\n')\n return tabla\n\n\n# ## Bring data to physical units\n#\n# We take as coordinate origin the mid-point of the max and min X and Y positions ever tracked within the region of interest (ROI) during the experiment.\n#\n# We have 2 options for lengh unit:\n# * milimeters\n# * Ball diameter\n#\n# The time scale unit is seconds.\n#\n# Only after this step position differences in the original data table are re-scaled as real velocities\n\n\n# PHYSICAL SCALES AND ORIGIN\n# Usage: re_pos_scale(0) for 1 mm as length unit; re_pos_scale(1) for ball diameter (sigma)\n# length unit\n\ndef set_origin(shiftx, shifty, tabla):\n tabla_not_shifted = tabla.copy()\n tabla['x'] -= shiftx\n tabla['y'] -= shifty\n return tabla_not_shifted, tabla\n\n\ndef scale(l_unit, t_unit, tabla):\n global iscaled\n tabla_not_scaled = tabla.copy()\n if iscaled == 0:\n inv_l_unit = 1./l_unit\n print(inv_l_unit)\n if np.any(tabla.columns == 'vx') or np.any(tabla.columns == 'vy'):\n print(inv_l_unit)\n tabla[['x', 'y']] *= inv_l_unit\n tabla[['vx', 'vy']] *= (inv_l_unit * t_unit)\n if np.any(tabla.columns == 'ax') or np.any(tabla.columns == 'ay'):\n tabla[['ax', 'ay']] *= (inv_l_unit * t_unit**2)\n else:\n tabla[['x', 'y']] *= inv_l_unit\n iscaled = 1\n else:\n tabla[['x', 'y']] *= 1\n iscaled = 0\n return tabla_not_scaled, tabla\n\n\n# ## RUN CELL 0\n# ### - Import pkl.xz tracks table\n# ### - create all individual tracks arrays\n# ### - analyze track length histogram, decide minimum track length\n\n\n# COMPLETE SET OF READING INSTRUCTIONS\n\nseries_directory = 'ppp/rho_020_049'\nhash_prefix = '0f9e94e168aa137db93f8b064448c7e2'\n\ninfo = get_info(series_directory, hash_prefix)\n\ndatos = simple_pickle_read(series_directory, hash_prefix)\ndatos_orig, datos = reset_track_indexes(datos)\n\n# Build all tracks, storing them individually\ntracks = all_tracks(datos, True)\nprint('\\n min for positions:\\n', np.min(datos[['x', 'y']]), '\\n')\nprint('max. 
for positions:\\n', np.max(datos[['x', 'y']]), '\\n')\n\n\n# Histogram of trajectory length\nplt.figure(1, figsize=(8, 8/ct.golden))\nplt.title('Trajectory lengths')\n# set upper height limit for histogram\ntrack_lengths = [len(tracks[i]) for i in range(Ntracks)]\nplt.ylim(0, 80)\n# number of bins for tr4ack length histogram\nnbins = 200\n# plot\nplt.ylim(0, 50)\nfig = plt.hist(track_lengths, nbins, color='b', range=(0, 200))\n\n\n# ## RUN CELL 1\n# ### - eliminate short tracks\n# ### - obtain position differences (vels)\n# ### - obtain velocities differences (accels)\n# ### - create all (non-short) individual tracks arrays. (minimum theoretical length: 5)\n\n\ntracks_table = [[] for i in range(info.n_frames[0])]\n\n\n# discard short tracks\nishort = 10 # minimum length of kept tracks\ntracks_table[ishort] = short_drop(ishort, datos)\ntracks_orig, tracks_table[ishort] = reset_track_indexes(tracks_table[ishort])\niscaled = 0\n\n# (butterworth)-filter particle positions\ntracks_unfiltered, tracks_table[ishort] = filter_tracks_butter(\n 0.8, 4, info.fps[0], tracks_table[ishort])\n\n# calculate velocities for all tracks\n#tracks_table[ishort] = diffs('x','y','vx','vy', tracks_table[ishort]);\n\n# calculate accelerations for all tracks\n#tracks_table[ishort] = diffs('vx','vy','ax','ay', tracks_table[ishort]);\n\n# print table head\nprint('not scaled: \\n')\ndisplay(HTML(tracks_table[ishort].head().to_html()))\n\n\n# scale and re-position the system (left-bottom corner is (0,0))\niscaled = 0\nnot_shifted, tracks_table[ishort] = set_origin(\n np.min(datos.x), np.min(datos.y), tracks_table[ishort]) # re-position\n\nnot_scaled, tracks_table[ishort] = scale(\n info.particle_diameter_px[0], 10**3/info.fps[0], tracks_table[ishort]) # scale\n\n# print table head\nprint('scaled: \\n')\ndisplay(HTML(tracks_table[ishort].head().to_html()))\n\n\n# store tracks individually\n#tracks = all_tracks(tracks_table[ishort], True)\n#\n# store system (instantaneous) states individually\n#states = all_states(tracks_table[ishort])\n\n\ntracks_table[ishort].tail()\n\n\n# ## _Plotting functions_\n#\n# __frameit(ax):__ frames a figure by drawing the image limits and the ROI limits as well\n#\n# __plt__\\___track(t_id,xs,tagit,fr\\_it):__ plots just one track (_t_\\__id_) within _fr_\\__it_ (if True) frames, with size _xs_ and prints the track no. if _tagit_ is True\n#\n# __plt__\\___tracks(init_id, final_id,xs,tagit,fr\\_it):__ plots _init_\\__id_ to _final_\\__id_ tracks within _fr_\\__it_ (if True) frames, with size _xs_ and prints the track no. 
if _tagit_ is True\n\n\ndef frameit(ax):\n ax.set_xlim(0, info['shape'][0][0])\n ax.set_ylim(0, info['shape'][0][1])\n rect = plt.Rectangle([250, 50], 750, 600, alpha=1,\n lw=10, fill=False, edgecolor='b')\n ax.add_artist(rect)\n\n# Plot just one track function (tagit?, frameit?)\n\n\ndef plt_track(t_id, xs, tagit, fr_it):\n if fr_it == True:\n fig, ax = plt.subplots(\n figsize=(xs, xs*info['shape'][0][1]/info['shape'][0][0]))\n px_size = 72./fig.dpi\n frameit(ax)\n else:\n fig, ax = plt.subplots()\n px_size = 72./fig.dpi\n\n plt.plot(tracks[t_id].x, tracks[t_id].y, '.',\n c='r', markersize=px_size, linewidth=None)\n if tagit == True:\n plt.text(np.mean(tracks[t_id].x), np.mean(tracks[t_id].y), str(t_id))\n\n\ndef plt_tracks(init_id, final_id, xs, tagit, fr_it):\n if fr_it == True:\n fig, ax = plt.subplots(\n figsize=(xs, xs*info['shape'][0][1]/info['shape'][0][0]))\n px_size = 72./fig.dpi\n frameit(ax)\n else:\n fig, ax = plt.subplots()\n px_size = 72./fig.dpi\n for i in range(init_id, final_id):\n plt.plot(tracks[i].x, tracks[i].y, '.', c='r',\n markersize=px_size, linewidth=None)\n if tagit == True:\n plt.text(np.mean(tracks[i].x), np.mean(tracks[i].y), str(i))\n\n\n# the scatter plot:\nset_up_graf(15, LX, LY)\nplt.hist2d(tracks_table[ishort]['x'],\n tracks_table[ishort]['y'], bins=350, cmap='gray')\nplt.clim(-10, 100)\n\n#plt.savefig('hist2d_N60_ap30.jpeg', bbox_inches='tight', quality=100)\n\nss100 = state(100, tracks_table[ishort])\nxy = ss100[['x', 'y']]\nnpart = len(ss100)\n\nX_off = 0.09 # this is the width of the border layer to discard\nY_off = X_off*0.5\n\n# Make Voronoi graph out of the last particle positions\nvor = Voronoi(xy)\n# Eliminate outer cells\ninner = inner_cells(vor, LX=LX, X_off=X_off, LY=LY, Y_off=Y_off)\n# Represent particle positions and inner Voronoi cells\nset_up_graf(16, LX, LY)\ngraf = graf_vor(1, 1, inner, LX, LY)\n\n#plt.savefig('voronoi_N160_ap30.jpeg', bbox_inches='tight', quality=100)\n\n\nplt.hist(np.abs(tracks_table[ishort].vx), 100, color='b')\n\n\nplt.hist(np.abs(tracks_table[ishort].vy), 100, color='b')\n\n\n# ## _Average in 'grains'_\n#\n# __grain(arr, step) :__ Builds coarse-grained trajectories. 
Designed for noisy trajectories.\n#\n# * Input: _arr_ array of data to coarse grain; _step_ the coarse-grain size (in time intervals)\n# * Output: RETURNS a data array\n\n\ndef grain(arr, step):\n invdt = 1./step\n n_arr = len(arr)\n gr_arr = [[i + 0.5*step, (arr[i+step]-arr[i])*invdt]\n for i in range(0, n_arr-step, step)]\n return np.array(gr_arr)\n\n\nNtracks\n\n\n# plot just one track, tagit, dont frame it\nplt_track(100, 7, True, False)\n\n\n# plot just one track, dont tagit, frame it\nplt_tracks(0, Ntracks, 7, False, True)\n\n\n# plot just one track, dont tagit, frame it\nplt_tracks(0, Ntracks, 7, False, True)\n\n\n# plot first 40 tracks, tag'em, dont frame them\nplt_tracks(0, 40, 7, True, False)\n\n\n# same as above, but framed\nplt_tracks(0, 150, 7, True, True)\n\n\n# draw all tracks (notice the tendency towards the right, specially low corner)\nplt_tracks(0, Ntracks, 7, True, True)\n\n\n# the scatter plot:\nplt.figure(figsize=(4, 4), dpi=140)\nplt.xlabel(r'$x/\\sigma$')\nplt.ylabel(r'$y/\\sigma$')\nplt.xlim(252, 1000)\nplt.ylim(60, 650)\nplt.hist2d(tracks_table[ishort]['x'],\n tracks_table[ishort]['y'], bins=350, cmap='gray')\nplt.clim(-10, 100)\n\n\nplt.savefig('hist2d_N60_ap30.jpeg', bbox_inches='tight', quality=100)\n\n\n# ## Static structure functions\n\n# ## PAIR CORRELATION FUNCTION _g(r)_\n\n\n6.67*1.e-11*0.11*5.97*1.e24/(3393.5*1.e3)**2\n\n\n(2*np.pi)**2*0.7/1.68**2\n\n\n2.*mp.pi*(0.7/3.80)**0.5\n\n\n# TRUE RADIAL DISTRIBUTION FUNCTION, simple version, no corner corrections\n\ndef gr(tabla, nrbin, LX, LY):\n global dr, hist, edges\n # The number of g(r) measurements is the binomial cofficient\n # since the points order in pairs does not matter; i.e., g(ri-rj)=g(rj-ri)\n npart = len(tabla)\n nr_measures = int(spec.binom(npart, 2))\n # print('measures per bin: ', nr_measures/nrbin) # print no. 
of measuremnts per bin\n # calculate the bin size, relative to particle radii\n drr = np.sqrt(LX**2 + LY**2)/nrbin\n # initialize array of radial distr function distances array and bin index\n dr = np.zeros(nr_measures)\n ii = 0\n # Loop over all statistically relevant particle pairs\n # calculating distances\n for i in range(npart-1):\n if (i % 100 == 0 and i > 0):\n printp(str(i) + ' particles performed')\n for j in range(i+1, npart):\n dr[ii] = (tabla.x[i]-tabla.x[j])**2 + (tabla.y[i] - tabla.y[j])**2\n if dr[ii] < 78:\n print('careful\\n')\n ii = ii + 1\n dr = np.sqrt(dr)\n # count number of measured distances per bin; i.e., get the g(r)\n hist, edges = np.histogram(dr, bins=nrbin)\n # create array of bin centers\n r = [edges[i] - 0.5*drr for i in range(len(hist))]\n # normalize the radial distribution function\n g = [(LX*LY/npart)*hist[i]/(2*np.pi*drr*r[i]) for i in range(len(hist))]\n #r = r[:-1]\n return r, g\n\n\nr, g = gr(state(50, datos_250), 200, 750, 600)\n\n\nNt = int(np.max(datos_250.frame))\ngdr = [np.zeros(100) for i in range(Nt)]\n\nfor i in range(1, Nt):\n if (i % 100 == 0 and i > 0):\n printp(str(i) + ' states performed out of '+str(Nt))\n r, gdr[i] = gr(state(i, datos_250), 100, 750, 600)\n\n\ng = np.array(gdr[1])\nfor i in range(2, Nt):\n g = g + np.array(gdr[i])\ng = g/(1.0*Nt)\n\n\nplt.plot(r, g)\n\n\nplt.plot(r[:-1], np.array(g), 's', markersize=4)\n\n\n# ## VELOCITY AUTOCORRELATIONS\n\n\n# VELOCITY AUTOCORRELATION functions\n\n# correlations for a track\ndef vel_autocorr_track(tabla):\n lt = len(tabla)\n vacx = np.empty(lt-1)\n vacy = np.empty(lt-1)\n tvc = np.empty(lt-1)\n for i in range(1, lt-1):\n tvc[i-1] = i\n vacx[i-1] = np.dot(tabla.vx[:-i], tabla.vx[i:])/(lt-i)\n vacy[i-1] = np.dot(tabla.vy[:-i], tabla.vy[i:])/(lt-i)\n return tvc, vacx, vacy\n\n# correlations average for all kept tracks\n\n\ndef vel_autocorr(tabla):\n av_vacx = np.zeros(Nframes-1)\n av_vacy = np.zeros(Nframes-1)\n no_measurmnts = np.zeros(Nframes, dtype=int)\n for i in range(Ntracks):\n tvc, vacx, vacy = vel_autocorr_track(tracks[i])\n av_vacx[:tr_lengths[i]-1] = av_vacx[:tr_lengths[i]-1] + vacx\n av_vacy[:tr_lengths[i]-1] = av_vacy[:tr_lengths[i]-1] + vacy\n no_measurmnts[:tr_lengths[i]] = no_measurmnts[:tr_lengths[i]] + 1\n printp('track ' + str(i) + ' of ' + str(Ntracks))\n for i in range(Nframes-1):\n av_vacx[i] = av_vacx[i]/no_measurmnts[i]\n av_vacy[i] = av_vacy[i]/no_measurmnts[i]\n return av_vacx, av_vacy\n\n\ntracks = all_tracks(datos_1000)\n\n\ntvc, vacx, vacy = vel_autocorr_track(tracks[1])\n\n\n# track10 , datos_2000\nfig = plt.figure(1)\npx_size = 72./fig.dpi\nplt.xlim(-10, 1000)\nplt.ylim(-0.1, 0.3)\nplt.scatter(tvc, vacx, marker='.', color='red', s=px_size)\nplt.scatter(tvc, vacy, marker='.', color='blue', s=px_size)\n\n\n# ## BASE $XY$ AND VORONOI FUNCTIONS\n\n\nLY\n\n\nss100 = state(100, tracks_table[ishort])\n\n\nlen(ss100)\n\n\n# The 3 previous lines allow for precise input of LX (if LX=LY)\nLX = LX/info.particle_diameter_px[0]\nLY = LY/info.particle_diameter_px[0]\n\n\nxy = ss100[['x', 'y']]\nnpart = len(ss100)\n\nX_off = 0.05 # this is the width of the border layer to discard\nY_off = X_off*0.5\n", "project_metadata": {"full_name": "fvegar/Tracks", "description": "Data analysis from experiments and molecular dynamics simulation", "topics": [], "git_url": "git://github.com/fvegar/Tracks.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2019-03-02T18:46:38Z", "size": 19326, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3661750, 
"Python": 138128, "Mathematica": 61934}, "last_updated": "2020-12-02T07:50:07Z"}, "intent": "# Make Voronoi graph out of the last particle positions"}, {"original_comment": "# 80% confidence interval\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom scipy.stats import gaussian_kde\nimport datetime\nfrom scipy.stats import norm\nfrom pandas_datareader import data as pdr\nimport quandl\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n#%%\n\n\n\n#%%\n\n\n\n#%%\n\nbitcoin = quandl.get(\"BCHAIN/MKPRU\")\n\n#%%\n\nbitcoin.columns = ['Close']\nbitcoin['Close'].replace(0, np.nan, inplace=True)\nbitcoin = bitcoin.dropna()\nbitcoin.Close.plot(logy=True)\nplt.legend()\n\n#%%\n\nbitcoin['Returns'] = (bitcoin['Close'].pct_change() + 1).fillna(1)\nbitcoin.head()\n\n#%%\n\nbitcoin.tail()\n\n#%%\n\n[x for x in bitcoin.Returns.tail(8) - 1]\n\n#%%\n\n# Trying to fit the returns to a normal distribution\n\n\nparameters = norm.fit(bitcoin.Returns - 1)\n\n# now, parameters[0] and parameters[1] are the mean and\n# the standard deviation of the fitted distribution\nx = np.linspace(min(bitcoin.Returns - 1), max(bitcoin.Returns - 1), 100)\n\n# Generate the pdf (fitted distribution)\nfitted_pdf = norm.pdf(x, loc=parameters[0], scale=parameters[1])\n# Generate the pdf (normal distribution non fitted)\nnormal_pdf = norm.pdf(x)\n\n# Type help(plot) for a ton of information on pyplot\nplt.plot(x, fitted_pdf, \"red\", label=\"Fitted normal dist\",\n linestyle=\"dashed\", linewidth=2)\n# plt.plot(x,normal_pdf,\"blue\",label=\"Normal dist\", linewidth=2)\nplt.hist(bitcoin.Returns - 1, normed=1, color=\"b\", alpha=.3, bins=200,\n label='Daily returns') # alpha, from 0 (transparent) to 1 (opaque)\nplt.title(\"Bitcoin returns and normal fitting\")\n# insert a legend in the plot (using label)\nplt.legend()\n# plt.yscale('log')\nplt.show()\n\n#%%\n\n\n\n#%%\n\ndatetime.date.today()\n\n#%%\n\nindex_future = pd.date_range(start=datetime.date.today(), end='2018-12-30')\nindex_future.shape\n\n#%%\n\nnp.random.seed(1234)\nsimulated_returns_bitcoin = np.random.choice(\n bitcoin.Returns, size=(len(index_future), 100000))\nsim_bitcoin_returns = pd.DataFrame(\n data=simulated_returns_bitcoin, index=index_future)\n# sim_bitcoin_returns\n\n#%%\n\ncum_sim_bitcoin = sim_bitcoin_returns.cumprod(axis=0)\ncum_sim_bitcoin.shape\n\n#%%\n\ncum_sim_bitcoin.iloc[:, :1000].plot(legend=False, logy=True)\n# plt.legend(False)\n\n#%%\n\nfuture = pd.DataFrame(data=cum_sim_bitcoin, index=index_future)\nfuture = future * bitcoin['Close'][-1]\n\n#%%\n\npossible_prices = future.iloc[-1, :]\npossible_prices.name = 'Possible price'\n\n#%%\n\nfuture.iloc[:, :200].plot(legend=False, logy=True, grid=True)\nyticks = plt.yticks()[0][1:-1]\nyticks = np.logspace(3, 6, 4)\nplt.yticks(yticks, ['{:,}'.format(t) for t in yticks])\nplt.title('Bitcoin price Monte Carlo simulations until Dec 31st 2018', size=16)\nplt.ylabel('Price ($)', size=12)\nplt.xlabel('Date', size=12)\n# plt.ylim(min(np.log10(future.iloc[:, :1000])), max(np.log10(future.iloc[:, :1000])))\n\n#%%\n\nfuture.iloc[:, :200].plot(legend=False, logy=False, grid=True)\nplt.title('Bitcoin price Monte Carlo simulations until Dec 31st 2018', size=16)\nplt.ylabel('Price ($)', size=12)\nplt.xlabel('Date', size=12)\n\n#%%\n\ny, x, _ = plt.hist(np.log(possible_prices), bins=200)\nxticks = plt.xticks()\nticks = np.linspace(min(np.log(possible_prices)),\n 
max(np.log(possible_prices)), 10)\nplt.xticks(ticks, [str(int(np.exp(tick))) for tick in ticks])\n\nplt.xlabel('Price ($)')\nplt.ylabel('Number of random walks')\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n#%%\n\n\n\n#%%\n\n# More logical ticks\n\n\n# Plot histogram of final simulated prices\nfig, ax = plt.subplots()\nax.hist(np.log(possible_prices), bins=150,\n density=True, label='Final simulated price')\nxticks = plt.xticks()\nhand_ticks = [100, 400, 1700, 6000, 24000,\n 90000, 340000, 1200000, 5000000, 18000000]\n# plt.xticks(np.log(hand_ticks), [str(tick) for tick in hand_ticks]);\nax.set_xticks(np.log(hand_ticks))\nax.set_xticklabels([str(tick) for tick in hand_ticks])\n\n# Plot KDE function\nkde = gaussian_kde(np.log(possible_prices))\nx = np.linspace(min(np.log(possible_prices)),\n max(np.log(possible_prices)), bins)\nax.plot(x, kde.pdf(x), linewidth=3, c='orange',\n alpha=1, label='KDE density function')\n\n# Plot vertical line at the most likely price\nmost_likely_price = np.exp(x[np.argmax(kde.pdf(x))])\n# most_likely_price = possible_prices.quantile(0.5)\nax.vlines(np.log(most_likely_price), 0, kde.pdf(\n np.log(most_likely_price)), color='w')\n\n# Draw annotation\nax.annotate('Most likely price: ${}'.format(int(round(most_likely_price))),\n xy=(np.log(most_likely_price), kde.pdf(np.log(most_likely_price))),\n xytext=(13, 0.3),\n arrowprops=dict(facecolor='black', shrink=0.0),\n size=14)\n\nplt.legend()\n\nplt.xlabel('Price ($) (log scale)', size=12)\nplt.ylabel('Density', size=12)\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n\nplt.show()\n\n#%%\n\n# More logical ticks\n\nfig, ax = plt.subplots()\nax.hist(np.log(possible_prices), bins=150, density=True)\nxticks = plt.xticks()\nhand_ticks = [100, 400, 1700, 6000, 24000,\n 90000, 340000, 1200000, 5000000, 18000000]\n# plt.xticks(np.log(hand_ticks), [str(tick) for tick in hand_ticks]);\nax.set_xticks(np.log(hand_ticks))\nax.set_xticklabels([str(tick) for tick in hand_ticks])\n\n# # Plot KDE function\n# kde = gaussian_kde(np.log(possible_prices))\n# x = np.linspace(min(np.log(possible_prices)), max(np.log(possible_prices)), bins)\n# ax.plot(x, kde.pdf(x), linewidth=2, c='orange', alpha=1)\n\n# # # Plot vertical line at the most likely price\n# most_likely_price = np.exp(x[np.argmax(kde.pdf(x))])\n# # most_likely_price = possible_prices.quantile(0.5)\n# ax.vlines(np.log(most_likely_price), 0, kde.pdf(np.log(most_likely_price)), color='w')\n\n# # Draw annotation\n# ax.annotate('Most likely price: ${}'.format(int(round(most_likely_price))),\n# xy=(np.log(most_likely_price), kde.pdf(np.log(most_likely_price))),\n# xytext=(13, 0.3),\n# arrowprops=dict(facecolor='black', shrink=0.0),\n# size=14)\n\n\nplt.xlabel('Price ($) (log scale)', size=12)\nplt.ylabel('Density', size=12)\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n\nplt.show()\n\n#%%\n\nmost_likely_price\n\n#%%\n\npossible_prices.quantile(0.5)\n\n#%%\n\npossible_prices.quantile(0.05)\n\n#%%\n\npossible_prices.quantile(0.95)\n\n#%%", "target_code": "possible_prices.quantile(.1), possible_prices.quantile(.9)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom scipy.stats import gaussian_kde\nimport datetime\nfrom scipy.stats import norm\nfrom pandas_datareader import data as pdr\nimport quandl\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 
'inline')\n\nsns.set()\n\n\n\n\n\n\n\n\nbitcoin = quandl.get(\"BCHAIN/MKPRU\")\n\n\nbitcoin.columns = ['Close']\nbitcoin['Close'].replace(0, np.nan, inplace=True)\nbitcoin = bitcoin.dropna()\nbitcoin.Close.plot(logy=True)\nplt.legend()\n\n\nbitcoin['Returns'] = (bitcoin['Close'].pct_change() + 1).fillna(1)\nbitcoin.head()\n\n\nbitcoin.tail()\n\n\n[x for x in bitcoin.Returns.tail(8) - 1]\n\n\n# Trying to fit the returns to a normal distribution\n\n\nparameters = norm.fit(bitcoin.Returns - 1)\n\n# now, parameters[0] and parameters[1] are the mean and\n# the standard deviation of the fitted distribution\nx = np.linspace(min(bitcoin.Returns - 1), max(bitcoin.Returns - 1), 100)\n\n# Generate the pdf (fitted distribution)\nfitted_pdf = norm.pdf(x, loc=parameters[0], scale=parameters[1])\n# Generate the pdf (normal distribution non fitted)\nnormal_pdf = norm.pdf(x)\n\n# Type help(plot) for a ton of information on pyplot\nplt.plot(x, fitted_pdf, \"red\", label=\"Fitted normal dist\",\n linestyle=\"dashed\", linewidth=2)\n# plt.plot(x,normal_pdf,\"blue\",label=\"Normal dist\", linewidth=2)\nplt.hist(bitcoin.Returns - 1, normed=1, color=\"b\", alpha=.3, bins=200,\n label='Daily returns') # alpha, from 0 (transparent) to 1 (opaque)\nplt.title(\"Bitcoin returns and normal fitting\")\n# insert a legend in the plot (using label)\nplt.legend()\n# plt.yscale('log')\nplt.show()\n\n\n\n\n\ndatetime.date.today()\n\n\nindex_future = pd.date_range(start=datetime.date.today(), end='2018-12-30')\nindex_future.shape\n\n\nnp.random.seed(1234)\nsimulated_returns_bitcoin = np.random.choice(\n bitcoin.Returns, size=(len(index_future), 100000))\nsim_bitcoin_returns = pd.DataFrame(\n data=simulated_returns_bitcoin, index=index_future)\n# sim_bitcoin_returns\n\n\ncum_sim_bitcoin = sim_bitcoin_returns.cumprod(axis=0)\ncum_sim_bitcoin.shape\n\n\ncum_sim_bitcoin.iloc[:, :1000].plot(legend=False, logy=True)\n# plt.legend(False)\n\n\nfuture = pd.DataFrame(data=cum_sim_bitcoin, index=index_future)\nfuture = future * bitcoin['Close'][-1]\n\n\npossible_prices = future.iloc[-1, :]\npossible_prices.name = 'Possible price'\n\n\nfuture.iloc[:, :200].plot(legend=False, logy=True, grid=True)\nyticks = plt.yticks()[0][1:-1]\nyticks = np.logspace(3, 6, 4)\nplt.yticks(yticks, ['{:,}'.format(t) for t in yticks])\nplt.title('Bitcoin price Monte Carlo simulations until Dec 31st 2018', size=16)\nplt.ylabel('Price ($)', size=12)\nplt.xlabel('Date', size=12)\n# plt.ylim(min(np.log10(future.iloc[:, :1000])), max(np.log10(future.iloc[:, :1000])))\n\n\nfuture.iloc[:, :200].plot(legend=False, logy=False, grid=True)\nplt.title('Bitcoin price Monte Carlo simulations until Dec 31st 2018', size=16)\nplt.ylabel('Price ($)', size=12)\nplt.xlabel('Date', size=12)\n\n\ny, x, _ = plt.hist(np.log(possible_prices), bins=200)\nxticks = plt.xticks()\nticks = np.linspace(min(np.log(possible_prices)),\n max(np.log(possible_prices)), 10)\nplt.xticks(ticks, [str(int(np.exp(tick))) for tick in ticks])\n\nplt.xlabel('Price ($)')\nplt.ylabel('Number of random walks')\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n\n\n\n\n# More logical ticks\n\n\n# Plot histogram of final simulated prices\nfig, ax = plt.subplots()\nax.hist(np.log(possible_prices), bins=150,\n density=True, label='Final simulated price')\nxticks = plt.xticks()\nhand_ticks = [100, 400, 1700, 6000, 24000,\n 90000, 340000, 1200000, 5000000, 18000000]\n# plt.xticks(np.log(hand_ticks), [str(tick) for tick in 
hand_ticks]);\nax.set_xticks(np.log(hand_ticks))\nax.set_xticklabels([str(tick) for tick in hand_ticks])\n\n# Plot KDE function\nkde = gaussian_kde(np.log(possible_prices))\nx = np.linspace(min(np.log(possible_prices)),\n max(np.log(possible_prices)), bins)\nax.plot(x, kde.pdf(x), linewidth=3, c='orange',\n alpha=1, label='KDE density function')\n\n# Plot vertical line at the most likely price\nmost_likely_price = np.exp(x[np.argmax(kde.pdf(x))])\n# most_likely_price = possible_prices.quantile(0.5)\nax.vlines(np.log(most_likely_price), 0, kde.pdf(\n np.log(most_likely_price)), color='w')\n\n# Draw annotation\nax.annotate('Most likely price: ${}'.format(int(round(most_likely_price))),\n xy=(np.log(most_likely_price), kde.pdf(np.log(most_likely_price))),\n xytext=(13, 0.3),\n arrowprops=dict(facecolor='black', shrink=0.0),\n size=14)\n\nplt.legend()\n\nplt.xlabel('Price ($) (log scale)', size=12)\nplt.ylabel('Density', size=12)\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n\nplt.show()\n\n\n# More logical ticks\n\nfig, ax = plt.subplots()\nax.hist(np.log(possible_prices), bins=150, density=True)\nxticks = plt.xticks()\nhand_ticks = [100, 400, 1700, 6000, 24000,\n 90000, 340000, 1200000, 5000000, 18000000]\n# plt.xticks(np.log(hand_ticks), [str(tick) for tick in hand_ticks]);\nax.set_xticks(np.log(hand_ticks))\nax.set_xticklabels([str(tick) for tick in hand_ticks])\n\n# # Plot KDE function\n# kde = gaussian_kde(np.log(possible_prices))\n# x = np.linspace(min(np.log(possible_prices)), max(np.log(possible_prices)), bins)\n# ax.plot(x, kde.pdf(x), linewidth=2, c='orange', alpha=1)\n\n# # # Plot vertical line at the most likely price\n# most_likely_price = np.exp(x[np.argmax(kde.pdf(x))])\n# # most_likely_price = possible_prices.quantile(0.5)\n# ax.vlines(np.log(most_likely_price), 0, kde.pdf(np.log(most_likely_price)), color='w')\n\n# # Draw annotation\n# ax.annotate('Most likely price: ${}'.format(int(round(most_likely_price))),\n# xy=(np.log(most_likely_price), kde.pdf(np.log(most_likely_price))),\n# xytext=(13, 0.3),\n# arrowprops=dict(facecolor='black', shrink=0.0),\n# size=14)\n\n\nplt.xlabel('Price ($) (log scale)', size=12)\nplt.ylabel('Density', size=12)\nplt.title('Distribution of bitcoin simulated prices by Dec 31st 2018', size=16)\n\n\nplt.show()\n\n\nmost_likely_price\n\n\npossible_prices.quantile(0.5)\n\n\npossible_prices.quantile(0.05)\n\n\npossible_prices.quantile(0.95)\n\n\n", "project_metadata": {"full_name": "xoelop/Misc", "description": "Some random stuff I've done", "topics": [], "git_url": "git://github.com/xoelop/Misc.git", "stars": 17, "watchers": 17, "forks": 9, "created": "2018-02-24T21:11:32Z", "size": 6974, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11964840}, "last_updated": "2020-06-04T17:34:09Z"}, "intent": "# 80% confidence interval"}, {"original_comment": "# Select random indices\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Pair-Wise Distance Analysis\n\n# ## Preliminaries\n\n#%%\n\nimport pickle\nimport samediff\nimport dp_align\nimport analyse_pairs\nimport sys\nimport seaborn as sns\nimport matplotlib.cbook as cbook\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nfrom tqdm import tqdm\nfrom scipy.spatial.distance import pdist\nfrom os import path\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsys.path.append(path.join(\"..\", \"..\", \"src\", \"speech_dtw\", \"utils\"))\n\n\n# ## Data\n\n#%%\n\n# Language and hash 
codes\nlanguage = \"SP\"\ncae_utd_hash = \"3507b1dee4\"\ncae_gt_hash = \"97d5e64521\"\n\nlanguage = \"HA\"\ncae_utd_hash = \"5addd62282\"\ncae_gt_hash = \"2962460ef0\"\n\nlanguage = \"CH\"\ncae_utd_hash = \"0af93a3cdf\"\ncae_gt_hash = \"8c7b5494d9\"\n\nlanguage = \"SW\"\ncae_utd_hash = \"d0e6bc9e00\"\ncae_gt_hash = \"9dc86566ea\"\n\nmultiling_hash = \"1482d0caf3\"\n\n#%%\n\n# Embedding filenames\ndownsample_fn = path.join(\"..\", \"downsample\", \"exp\",\n language, \"mfcc.dev.filter1_gt.downsample_10.npz\")\ncae_utd_fn = path.join(\"..\", \"embeddings\", \"models\", language + \".utd\", \"train_cae_rnn\",\n cae_utd_hash, \"cae.best_val.\" + language.lower() + \".dev.filter1_gt.npz\")\ncae_gt_fn = path.join(\"..\", \"embeddings\", \"models\", language + \".gt\", \"train_cae_rnn\",\n cae_gt_hash, \"cae.best_val.\" + language.lower() + \".dev.filter1_gt.npz\")\nmultiling_fn = path.join(\"..\", \"embeddings\", \"models\", \"RU+CZ+FR+PL+TH+PO.gt\", \"train_cae_rnn\",\n multiling_hash, \"cae.best_val.\" + language.lower() + \".dev.filter1_gt.npz\")\n\n#%%\n\n# Embeddings\nembeddings = {} # embeddings[\"downsample\"] gives the embeddings of a method\nembeddings[\"downsample\"] = np.load(downsample_fn)\nembeddings[\"cae_utd\"] = np.load(cae_utd_fn)\nembeddings[\"cae_gt\"] = np.load(cae_gt_fn)\nembeddings[\"multiling\"] = np.load(multiling_fn)\n\n#%%\n\n# Models\nmodels = [\"downsample\", \"cae_utd\", \"cae_gt\", \"multiling\"]\nmodel_titles = [\"Downsample\",\n \"CAE-RNN (UTD)\", \"CAE-RNN (GT)\", \"CAE-RNN (multiling.)\"]\n\n\n# ## Distances\n\n#%%\n\ndef process_embeddings(embeddings):\n print(\"Ordering embeddings:\")\n n_embeds = 0\n X = []\n utt_keys = []\n labels = []\n speakers = []\n for utt_key in tqdm(sorted(embeddings)):\n utt_keys.append(utt_key)\n X.append(embeddings[utt_key])\n utt_key = utt_key.split(\"_\")\n label = utt_key[0]\n speaker = utt_key[1]\n labels.append(label)\n speakers.append(speaker)\n X = np.array(X)\n print(\"No. embeddings:\", X.shape[0])\n print(\"Embedding dimensionality:\", X.shape[1])\n\n # Normalise\n normed = (X - X.mean(axis=0)) / X.std(axis=0)\n X = normed\n\n print(\"Calculating distances\")\n distances = pdist(X, metric=\"cosine\")\n\n return (utt_keys, labels, speakers, distances)\n\n#%%\n\ndistances = {}\nfor model in [\"downsample\", \"cae_utd\", \"cae_gt\", \"multiling\"]:\n print(\"Model:\", model)\n (cur_utt_keys, cur_labels, cur_speakers,\n cur_distances) = process_embeddings(embeddings[model])\n distances[model] = cur_distances\nutt_keys = cur_utt_keys\nlabels = cur_labels\nspeakers = cur_speakers\n\nprint(\"Calculating word matches\")\nword_matches = samediff.generate_matches_array(labels)\nprint(\"Total no. pairs:\", word_matches.shape[0])\nprint(\"No. same-word pairs:\", sum(word_matches))\nprint(\"Calculating speaker matches\")\nspeaker_matches = samediff.generate_matches_array(speakers)\nprint(\"No. 
same-speaker pairs:\", sum(speaker_matches))\n\n\n# ## Word type\n\n#%%\n\n# Distances\npos = {}\nneg = {}\nfor model in models:\n # Distances\n cur_pos = distances[model][word_matches == True]\n cur_neg = distances[model][word_matches == False]\n pos[model] = cur_pos\n neg[model] = cur_neg\n\n#%%\n\n# Dataframe\ndata_same_word = []\ndata_diff_word = []\nfor model in models:\n data_same_word += [[model, d, \"same\"] for d in pos[model]]\n data_diff_word += [[model, d, \"different\"] for d in neg[model]]\ndf_word = pd.DataFrame(data_diff_word + data_same_word,\n columns=[\"model\", \"distance\", \"match\"])\ndel data_same_word\ndel data_diff_word\n\n#%%\n\ndf_word\n\n#%%\n\n# Box plot\nfig, ax = plt.subplots(figsize=(10, 5))\n# ax.set_xticklabels(ax.get_xticklabels(), rotation=90);\nax = sns.boxplot(ax=ax, x=\"model\", y=\"distance\", hue=\"match\",\n data=df_word, order=models[:-1], showfliers=False)\nhandles, lables = ax.get_legend_handles_labels()\nax.legend(handles, [\"Different word\", \"Same word\"], loc=\"upper right\")\nax.set_xticklabels(model_titles[:-1])\nplt.xlabel(\"Models\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".word_boxplot.pdf\"))\n\n\n# ## Speaker identity\n\n#%%\n\n# Distances\npos = {}\nneg = {}\nfor model in models:\n cur_pos = distances[model][np.logical_and(word_matches, speaker_matches)]\n cur_neg = distances[model][np.logical_and(\n word_matches, speaker_matches == False)]\n pos[model] = cur_pos\n neg[model] = cur_neg\nprint(\"No. positive distances:\", len(cur_pos))\nprint(\"No. negative distances:\", len(cur_neg))\n\n# Dataframe\ndata_same_speaker = []\ndata_diff_speaker = []\nfor model in models:\n data_same_speaker += [[model, d, \"same\"] for d in pos[model]]\n data_diff_speaker += [[model, d, \"different\"] for d in neg[model]]\ndf_speaker = pd.DataFrame(\n data_same_speaker + data_diff_speaker, columns=[\"model\", \"distance\", \"match\"])\n\n#%%\n\n# Box plot\nfig, ax = plt.subplots(figsize=(10, 5))\nax = sns.boxplot(data=df_speaker, ax=ax, x=\"model\", y=\"distance\",\n hue=\"match\", order=models[:-1], showfliers=False)\nhandles, lables = ax.get_legend_handles_labels()\nax.legend(handles, [\"Different speaker\", \"Same speaker\"], loc=\"upper right\")\nax.set_xticklabels(model_titles[:-1])\nplt.xlabel(\"Models\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".speaker_boxplot.pdf\"))\n\n\n# ## Number of phones and segment duration\n\n#%%\n\n# Pronunciations\npron_fn = path.join(\"lists\", language, \"dev.prons\")\nprint(\"Reading:\", pron_fn)\npronunciations = analyse_pairs.read_pronunciations(pron_fn)\npron_labels = []\nfor utt_key in utt_keys:\n pron_labels.append(pronunciations[utt_key])\n\n# Get distances\nprint(\"Getting edit distances:\")\nedit_distances = analyse_pairs.editdistance_array(pron_labels)\n\n#%%\n\n# Save intermediate edit distances\nfn = path.join(\"doc\", language.lower() + \".edit_distances.pkl\")\nif not path.isfile(fn):\n print(\"Writing:\", fn)\n with open(fn, \"wb\") as f:\n pickle.dump(edit_distances, f, -1)\nelse:\n print(\"Reading:\", fn)\n with open(fn, \"rb\") as f:\n edit_distances = pickle.load(f)\n\n#%%\n\n# Collect distances\nedits = sorted(set(edit_distances))\ndata_edit_distance = []\naverages = {}\nstds = {}\nfor model in models:\n print(\"Collecting distances:\", model)\n averages[model] = []\n stds[model] = []\n for edit in tqdm(edits):\n averages[model].append(\n np.mean(distances[model][edit_distances == edit]))\n 
stds[model].append(np.std(distances[model][edit_distances == edit]))\n averages[model] = np.array(averages[model])\n stds[model] = np.array(stds[model])\n\n#%%\n\n# Plot\nfig, ax = plt.subplots(figsize=(10, 5))\nplt.plot(edits, averages[\"downsample\"], \"C0o-\",\n label=\"Downsample\", markeredgecolor=\"white\")\nplt.fill_between(edits, averages[\"downsample\"] - stds[\"downsample\"],\n averages[\"downsample\"] + stds[\"downsample\"], alpha=0.1, color=\"C0\")\nplt.plot(edits, averages[\"cae_utd\"], \"C1s-\",\n label=\"CAE-RNN (UTD)\", markeredgecolor=\"white\")\nplt.fill_between(edits, averages[\"cae_utd\"] - stds[\"cae_utd\"],\n averages[\"cae_utd\"] + stds[\"cae_utd\"], alpha=0.1, color=\"C1\")\nplt.plot(edits, averages[\"cae_gt\"], \"C2D-\",\n label=\"CAE-RNN (GT)\", markeredgecolor=\"white\")\n# plt.fill_between(edits, averages[\"cae_gt\"] - stds[\"cae_gt\"], averages[\"cae_gt\"] + stds[\"cae_gt\"], alpha=0.1, color=\"C2\")\nplt.xlim([0, 10])\nplt.legend()\nplt.xlabel(\"Phone edit distance\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".edit_distances.pdf\"))\n\n#%%\n\n# Distance vs. duration\n\ndef get_duration(utt_key):\n _, _, _, interval = utt_key.split(\"_\")\n interval = [int(i) for i in interval.split(\"-\")]\n return interval[1] - interval[0]\n\n\nN = len(utt_keys)\nduration_diffs = np.zeros(int(N*(N - 1)/2))\n\n# Calculate the absolute duration difference for every pair of labels\nprint(\"Calculating duration differences:\")\ncur_duration_i = 0\nfor n in tqdm(range(N - 1)):\n cur_utt_key = utt_keys[n]\n for i_offset, test_utt_key in enumerate(utt_keys[n + 1:]):\n duration_diffs[cur_duration_i + i_offset] = abs(\n get_duration(cur_utt_key) - get_duration(test_utt_key))\n cur_duration_i += N - n - 1\n\n#%%", "target_code": "np.random.seed(1)\nindices = np.arange(duration_diffs.shape[0])\nnp.random.shuffle(indices)\nindices = indices[:10000]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Pair-Wise Distance Analysis\n\n# ## Preliminaries\n\n\nimport pickle\nimport samediff\nimport dp_align\nimport analyse_pairs\nimport sys\nimport seaborn as sns\nimport matplotlib.cbook as cbook\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nfrom tqdm import tqdm\nfrom scipy.spatial.distance import pdist\nfrom os import path\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsys.path.append(path.join(\"..\", \"..\", \"src\", \"speech_dtw\", \"utils\"))\n\n\n# ## Data\n\n\n# Language and hash codes\nlanguage = \"SP\"\ncae_utd_hash = \"3507b1dee4\"\ncae_gt_hash = \"97d5e64521\"\n\nlanguage = \"HA\"\ncae_utd_hash = \"5addd62282\"\ncae_gt_hash = \"2962460ef0\"\n\nlanguage = \"CH\"\ncae_utd_hash = \"0af93a3cdf\"\ncae_gt_hash = \"8c7b5494d9\"\n\nlanguage = \"SW\"\ncae_utd_hash = \"d0e6bc9e00\"\ncae_gt_hash = \"9dc86566ea\"\n\nmultiling_hash = \"1482d0caf3\"\n\n\n# Embedding filenames\ndownsample_fn = path.join(\"..\", \"downsample\", \"exp\",\n language, \"mfcc.dev.filter1_gt.downsample_10.npz\")\ncae_utd_fn = path.join(\"..\", \"embeddings\", \"models\", language + \".utd\", \"train_cae_rnn\",\n cae_utd_hash, \"cae.best_val.\" + language.lower() + \".dev.filter1_gt.npz\")\ncae_gt_fn = path.join(\"..\", \"embeddings\", \"models\", language + \".gt\", \"train_cae_rnn\",\n cae_gt_hash, \"cae.best_val.\" + language.lower() + \".dev.filter1_gt.npz\")\nmultiling_fn = path.join(\"..\", \"embeddings\", \"models\", \"RU+CZ+FR+PL+TH+PO.gt\", \"train_cae_rnn\",\n multiling_hash, \"cae.best_val.\" + language.lower() + 
\".dev.filter1_gt.npz\")\n\n\n# Embeddings\nembeddings = {} # embeddings[\"downsample\"] gives the embeddings of a method\nembeddings[\"downsample\"] = np.load(downsample_fn)\nembeddings[\"cae_utd\"] = np.load(cae_utd_fn)\nembeddings[\"cae_gt\"] = np.load(cae_gt_fn)\nembeddings[\"multiling\"] = np.load(multiling_fn)\n\n\n# Models\nmodels = [\"downsample\", \"cae_utd\", \"cae_gt\", \"multiling\"]\nmodel_titles = [\"Downsample\",\n \"CAE-RNN (UTD)\", \"CAE-RNN (GT)\", \"CAE-RNN (multiling.)\"]\n\n\n# ## Distances\n\n\ndef process_embeddings(embeddings):\n print(\"Ordering embeddings:\")\n n_embeds = 0\n X = []\n utt_keys = []\n labels = []\n speakers = []\n for utt_key in tqdm(sorted(embeddings)):\n utt_keys.append(utt_key)\n X.append(embeddings[utt_key])\n utt_key = utt_key.split(\"_\")\n label = utt_key[0]\n speaker = utt_key[1]\n labels.append(label)\n speakers.append(speaker)\n X = np.array(X)\n print(\"No. embeddings:\", X.shape[0])\n print(\"Embedding dimensionality:\", X.shape[1])\n\n # Normalise\n normed = (X - X.mean(axis=0)) / X.std(axis=0)\n X = normed\n\n print(\"Calculating distances\")\n distances = pdist(X, metric=\"cosine\")\n\n return (utt_keys, labels, speakers, distances)\n\n\ndistances = {}\nfor model in [\"downsample\", \"cae_utd\", \"cae_gt\", \"multiling\"]:\n print(\"Model:\", model)\n (cur_utt_keys, cur_labels, cur_speakers,\n cur_distances) = process_embeddings(embeddings[model])\n distances[model] = cur_distances\nutt_keys = cur_utt_keys\nlabels = cur_labels\nspeakers = cur_speakers\n\nprint(\"Calculating word matches\")\nword_matches = samediff.generate_matches_array(labels)\nprint(\"Total no. pairs:\", word_matches.shape[0])\nprint(\"No. same-word pairs:\", sum(word_matches))\nprint(\"Calculating speaker matches\")\nspeaker_matches = samediff.generate_matches_array(speakers)\nprint(\"No. same-speaker pairs:\", sum(speaker_matches))\n\n\n# ## Word type\n\n\n# Distances\npos = {}\nneg = {}\nfor model in models:\n # Distances\n cur_pos = distances[model][word_matches == True]\n cur_neg = distances[model][word_matches == False]\n pos[model] = cur_pos\n neg[model] = cur_neg\n\n\n# Dataframe\ndata_same_word = []\ndata_diff_word = []\nfor model in models:\n data_same_word += [[model, d, \"same\"] for d in pos[model]]\n data_diff_word += [[model, d, \"different\"] for d in neg[model]]\ndf_word = pd.DataFrame(data_diff_word + data_same_word,\n columns=[\"model\", \"distance\", \"match\"])\ndel data_same_word\ndel data_diff_word\n\n\ndf_word\n\n\n# Box plot\nfig, ax = plt.subplots(figsize=(10, 5))\n# ax.set_xticklabels(ax.get_xticklabels(), rotation=90);\nax = sns.boxplot(ax=ax, x=\"model\", y=\"distance\", hue=\"match\",\n data=df_word, order=models[:-1], showfliers=False)\nhandles, lables = ax.get_legend_handles_labels()\nax.legend(handles, [\"Different word\", \"Same word\"], loc=\"upper right\")\nax.set_xticklabels(model_titles[:-1])\nplt.xlabel(\"Models\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".word_boxplot.pdf\"))\n\n\n# ## Speaker identity\n\n\n# Distances\npos = {}\nneg = {}\nfor model in models:\n cur_pos = distances[model][np.logical_and(word_matches, speaker_matches)]\n cur_neg = distances[model][np.logical_and(\n word_matches, speaker_matches == False)]\n pos[model] = cur_pos\n neg[model] = cur_neg\nprint(\"No. positive distances:\", len(cur_pos))\nprint(\"No. 
negative distances:\", len(cur_neg))\n\n# Dataframe\ndata_same_speaker = []\ndata_diff_speaker = []\nfor model in models:\n data_same_speaker += [[model, d, \"same\"] for d in pos[model]]\n data_diff_speaker += [[model, d, \"different\"] for d in neg[model]]\ndf_speaker = pd.DataFrame(\n data_same_speaker + data_diff_speaker, columns=[\"model\", \"distance\", \"match\"])\n\n\n# Box plot\nfig, ax = plt.subplots(figsize=(10, 5))\nax = sns.boxplot(data=df_speaker, ax=ax, x=\"model\", y=\"distance\",\n hue=\"match\", order=models[:-1], showfliers=False)\nhandles, lables = ax.get_legend_handles_labels()\nax.legend(handles, [\"Different speaker\", \"Same speaker\"], loc=\"upper right\")\nax.set_xticklabels(model_titles[:-1])\nplt.xlabel(\"Models\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".speaker_boxplot.pdf\"))\n\n\n# ## Number of phones and segment duration\n\n\n# Pronunciations\npron_fn = path.join(\"lists\", language, \"dev.prons\")\nprint(\"Reading:\", pron_fn)\npronunciations = analyse_pairs.read_pronunciations(pron_fn)\npron_labels = []\nfor utt_key in utt_keys:\n pron_labels.append(pronunciations[utt_key])\n\n# Get distances\nprint(\"Getting edit distances:\")\nedit_distances = analyse_pairs.editdistance_array(pron_labels)\n\n\n# Save intermediate edit distances\nfn = path.join(\"doc\", language.lower() + \".edit_distances.pkl\")\nif not path.isfile(fn):\n print(\"Writing:\", fn)\n with open(fn, \"wb\") as f:\n pickle.dump(edit_distances, f, -1)\nelse:\n print(\"Reading:\", fn)\n with open(fn, \"rb\") as f:\n edit_distances = pickle.load(f)\n\n\n# Collect distances\nedits = sorted(set(edit_distances))\ndata_edit_distance = []\naverages = {}\nstds = {}\nfor model in models:\n print(\"Collecting distances:\", model)\n averages[model] = []\n stds[model] = []\n for edit in tqdm(edits):\n averages[model].append(\n np.mean(distances[model][edit_distances == edit]))\n stds[model].append(np.std(distances[model][edit_distances == edit]))\n averages[model] = np.array(averages[model])\n stds[model] = np.array(stds[model])\n\n\n# Plot\nfig, ax = plt.subplots(figsize=(10, 5))\nplt.plot(edits, averages[\"downsample\"], \"C0o-\",\n label=\"Downsample\", markeredgecolor=\"white\")\nplt.fill_between(edits, averages[\"downsample\"] - stds[\"downsample\"],\n averages[\"downsample\"] + stds[\"downsample\"], alpha=0.1, color=\"C0\")\nplt.plot(edits, averages[\"cae_utd\"], \"C1s-\",\n label=\"CAE-RNN (UTD)\", markeredgecolor=\"white\")\nplt.fill_between(edits, averages[\"cae_utd\"] - stds[\"cae_utd\"],\n averages[\"cae_utd\"] + stds[\"cae_utd\"], alpha=0.1, color=\"C1\")\nplt.plot(edits, averages[\"cae_gt\"], \"C2D-\",\n label=\"CAE-RNN (GT)\", markeredgecolor=\"white\")\n# plt.fill_between(edits, averages[\"cae_gt\"] - stds[\"cae_gt\"], averages[\"cae_gt\"] + stds[\"cae_gt\"], alpha=0.1, color=\"C2\")\nplt.xlim([0, 10])\nplt.legend()\nplt.xlabel(\"Phone edit distance\")\nplt.ylabel(\"Distance\")\nplt.savefig(path.join(\"doc\", language.lower() + \".edit_distances.pdf\"))\n\n\n# Distance vs. 
duration\n\ndef get_duration(utt_key):\n _, _, _, interval = utt_key.split(\"_\")\n interval = [int(i) for i in interval.split(\"-\")]\n return interval[1] - interval[0]\n\n\nN = len(utt_keys)\nduration_diffs = np.zeros(int(N*(N - 1)/2))\n\n# Calculate the absolute duration difference for every pair of labels\nprint(\"Calculating duration differences:\")\ncur_duration_i = 0\nfor n in tqdm(range(N - 1)):\n cur_utt_key = utt_keys[n]\n for i_offset, test_utt_key in enumerate(utt_keys[n + 1:]):\n duration_diffs[cur_duration_i + i_offset] = abs(\n get_duration(cur_utt_key) - get_duration(test_utt_key))\n cur_duration_i += N - n - 1\n\n", "project_metadata": {"full_name": "kamperh/globalphone_awe", "description": "Multilingual acoustic word embedding approaches applied and evaluated on GlobalPhone data.", "topics": [], "git_url": "git://github.com/kamperh/globalphone_awe.git", "stars": 9, "watchers": 9, "forks": 4, "created": "2020-02-05T13:18:47Z", "size": 3672, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1141290, "Python": 382967, "Shell": 4338, "Makefile": 20}, "last_updated": "2020-12-29T13:40:52Z"}, "intent": "# Select random indices"}, {"original_comment": "# Run this cell to define training and validation generators\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Programming Assignment\n\n# ## Data pipeline with Keras and tf.data\n\n# ### Instructions\n#\n# In this notebook, you will implement a data processing pipeline using tools from both Keras and the tf.data module. You will use the `ImageDataGenerator` class in the tf.keras module to feed a network with training and test images from a local directory containing a subset of the LSUN dataset, and train the model both with and without data augmentation. You will then use the `map` and `filter` functions of the `Dataset` class with the CIFAR-100 dataset to train a network to classify a processed subset of the images.\n#\n# Some code cells are provided you in the notebook. You should avoid editing provided code, and make sure to execute the cells in order to avoid unexpected errors. Some cells begin with the line:\n#\n# `#### GRADED CELL ####`\n#\n# Don't move or edit this first line - this is what the automatic grader looks for to recognise graded cells. These cells require you to write your own code to complete them, and are automatically graded when you submit the notebook. Don't edit the function name or signature provided in these cells, otherwise the automatic grader might not function properly. Inside these graded cells, you can use any functions or classes that are imported below, but make sure you don't use any variables that are outside the scope of the function.\n#\n# ### How to submit\n#\n# Complete all the tasks you are asked for in the worksheet. When you have finished and are happy with your code, press the **Submit Assignment** button at the top of this notebook.\n#\n# ### Let's get started!\n#\n# We'll start running some imports, and loading the dataset. Do not edit the existing imports in the following cell. If you would like to make further Tensorflow imports, you should add them here.\n\n#%%\n\n#### PACKAGE IMPORTS ####\n\n# Run this cell first to import all required packages. 
Do not make any imports elsewhere in the notebook\n\nfrom tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Dropout\nfrom tensorflow.keras.models import Sequential, Model\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nimport tensorflow as tf\nfrom tensorflow.keras.datasets import cifar100\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport json\nget_ipython().run_line_magic('matplotlib', 'inline')\n# If you would like to make further imports from tensorflow, add them here\n\n\n# ### Part 1: tf.keras\n# \n# \n# \n# \n#
\"Church\" \"Classroom\" \"Conference
\n#\n# #### The LSUN Dataset\n#\n# In the first part of this assignment, you will use a subset of the [LSUN dataset](https://www.yf.io/p/lsun). This is a large-scale image dataset with 10 scene and 20 object categories. A subset of the LSUN dataset has been provided, and has already been split into training and test sets. The three classes included in the subset are `church_outdoor`, `classroom` and `conference_room`.\n#\n# * F. Yu, A. Seff, Y. Zhang, S. Song, T. Funkhouser and J. Xia. \"LSUN: Construction of a Large-scale Image Dataset using Deep Learning with Humans in the Loop\". arXiv:1506.03365, 10 Jun 2015\n#\n# Your goal is to use the Keras preprocessing tools to construct a data ingestion and augmentation pipeline to train a neural network to classify the images into the three classes.\n\n#%%\n\n# Save the directory locations for the training, validation and test sets\n\ntrain_dir = 'data/lsun/train'\nvalid_dir = 'data/lsun/valid'\ntest_dir = 'data/lsun/test'\n\n\n# #### Create a data generator using the ImageDataGenerator class\n\n# You should first write a function that creates an `ImageDataGenerator` object, which rescales the image pixel values by a factor of 1/255.\n\n#%%\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure to not change the function name or arguments.\n\ndef get_ImageDataGenerator():\n return ImageDataGenerator(rescale=(1/255))\n \"\"\"\n This function should return an instance of the ImageDataGenerator class.\n This instance should be set up to rescale the data with the above scaling factor.\n \"\"\"\n\n#%%\n\n# Call the function to get an ImageDataGenerator as specified\nimage_gen = get_ImageDataGenerator()\n\n\n# You should now write a function that returns a generator object that will yield batches of images and labels from the training and test set directories. The generators should:\n#\n# * Generate batches of size 20.\n# * Resize the images to 64 x 64 x 3.\n# * Return one-hot vectors for labels. These should be encoded as follows:\n# * `classroom` $\\rightarrow$ `[1., 0., 0.]`\n# * `conference_room` $\\rightarrow$ `[0., 1., 0.]`\n# * `church_outdoor` $\\rightarrow$ `[0., 0., 1.]`\n# * Pass in an optional random `seed` for shuffling (this should be passed into the `flow_from_directory` method).\n#\n# **Hint:** you may need to refer to the [documentation](https://keras.io/preprocessing/image/#imagedatagenerator-class) for the `ImageDataGenerator`.\n\n#%%\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef get_generator(image_data_generator, directory, seed=None):\n\n return image_data_generator.flow_from_directory(directory, target_size=(64, 64),\n classes=['classroom', 'conference_room', 'church_outdoor'], class_mode=\"categorical\",\n batch_size=20, seed=seed)\n\n \"\"\"\n This function takes an ImageDataGenerator object in the first argument and a \n directory path in the second argument.\n It should use the ImageDataGenerator to return a generator object according \n to the above specifications. 
\n The seed argument should be passed to the flow_from_directory method.\n \"\"\"\n\n#%%\n\n# Run this cell to define training and validation generators\ntrain_generator = get_generator(image_gen, train_dir)\nvalid_generator = get_generator(image_gen, valid_dir)\n\n\n# We are using a small subset of the dataset for demonstrative purposes in this assignment.\n\n# #### Display sample images and labels from the training set\n#\n# The following cell depends on your function `get_generator` to be implemented correctly. If it raises an error, go back and check the function specifications carefully.\n\n#%%\n\n# Display a few images and labels from the training set\n\nbatch = next(train_generator)\nbatch_images = np.array(batch[0])\nbatch_labels = np.array(batch[1])\nlsun_classes = ['classroom', 'conference_room', 'church_outdoor']\n\nplt.figure(figsize=(16, 10))\nfor i in range(20):\n ax = plt.subplot(4, 5, i+1)\n plt.imshow(batch_images[i])\n plt.title(lsun_classes[np.where(batch_labels[i] == 1.)[0][0]])\n plt.axis('off')\n\n#%%\n\n# Reset the training generator\n\ntrain_generator = get_generator(image_gen, train_dir)\n\n\n# #### Build the neural network model\n#\n# You will now build and compile a convolutional neural network classifier. Using the functional API, build your model according to the following specifications:\n#\n# * The model should use the `input_shape` in the function argument to define the Input layer.\n# * The first hidden layer should be a Conv2D layer with 8 filters, a 8x8 kernel size.\n# * The second hidden layer should be a MaxPooling2D layer with a 2x2 pooling window size.\n# * The third hidden layer should be a Conv2D layer with 4 filters, a 4x4 kernel size.\n# * The fourth hidden layer should be a MaxPooling2D layer with a 2x2 pooling window size.\n# * This should be followed by a Flatten layer, and then a Dense layer with 16 units and ReLU activation.\n# * The final layer should be a Dense layer with 3 units and softmax activation.\n# * All Conv2D layers should use `\"SAME\"` padding and a ReLU activation function.\n#\n# In total, the network should have 8 layers. The model should then be compiled with the Adam optimizer with learning rate 0.0005, categorical cross entropy loss, and categorical accuracy metric.\n\n#%%\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef get_model(input_shape):\n inputs = Input(shape=input_shape)\n x = Conv2D(8, 8, activation='relu', padding='same')(inputs)\n x = MaxPooling2D(2)(x)\n x = Conv2D(4, 4, activation='relu', padding='same')(x)\n x = MaxPooling2D(2)(x)\n x = Flatten()(x)\n x = Dense(16, activation='relu')(x)\n outputs = Dense(3, activation='softmax')(x)\n model = Model(inputs=inputs, outputs=outputs)\n adam = tf.keras.optimizers.Adam(learning_rate=0.0005)\n model.compile(optimizer=adam, loss='categorical_crossentropy',\n metrics=['accuracy'])\n return model\n \"\"\"\n This function should build and compile a CNN model according to the above specification,\n using the functional API. Your function should return the model.\n \"\"\"\n\n#%%\n\n# Build and compile the model, print the model summary\nlsun_model = get_model((64, 64, 3))\nlsun_model.summary()\n\n\n# #### Train the neural network model\n#\n# You should now write a function to train the model for a specified number of epochs (specified in the `epochs` argument). 
The function takes a `model` argument, as well as `train_gen` and `valid_gen` arguments for the training and validation generators respectively, which you should use for training and validation data in the training run. You should also use the following callbacks:\n#\n# * An `EarlyStopping` callback that monitors the validation accuracy and has patience set to 10.\n# * A `ReduceLROnPlateau` callback that monitors the validation loss and has the factor set to 0.5 and minimum learning set to 0.0001\n#\n# Your function should return the training history.\n\n#%%\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef train_model(model, train_gen, valid_gen, epochs):\n earlystopping = tf.keras.callbacks.EarlyStopping(patience=10)\n reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,\n min_lr=0.0001)\n history = model.fit_generator(train_gen, epochs=epochs, validation_data=valid_gen,\n callbacks=[earlystopping, reduce_lr])\n return history\n \"\"\"\n This function should define the callback objects specified above, and then use the\n train_gen and valid_gen generator object arguments to train the model for the (maximum) \n number of epochs specified in the function argument, using the defined callbacks.\n The function should return the training history.\n \"\"\"\n\n#%%\n\n# Train the model for (maximum) 50 epochs\nhistory = train_model(lsun_model, train_generator, valid_generator, epochs=50)\n\n\n# #### Plot the learning curves\n\n#%%\n\n# Run this cell to plot accuracy vs epoch and loss vs epoch\n\nplt.figure(figsize=(15, 5))\nplt.subplot(121)\ntry:\n plt.plot(history.history['accuracy'])\n plt.plot(history.history['val_accuracy'])\nexcept KeyError:\n try:\n plt.plot(history.history['acc'])\n plt.plot(history.history['val_acc'])\n except KeyError:\n plt.plot(history.history['categorical_accuracy'])\n plt.plot(history.history['val_categorical_accuracy'])\nplt.title('Accuracy vs. epochs')\nplt.ylabel('Accuracy')\nplt.xlabel('Epoch')\nplt.legend(['Training', 'Validation'], loc='lower right')\n\nplt.subplot(122)\nplt.plot(history.history['loss'])\nplt.plot(history.history['val_loss'])\nplt.title('Loss vs. epochs')\nplt.ylabel('Loss')\nplt.xlabel('Epoch')\nplt.legend(['Training', 'Validation'], loc='upper right')\nplt.show()\n\n\n# You may notice overfitting in the above plots, through a growing discrepancy between the training and validation loss and accuracy. We will aim to mitigate this using data augmentation. 
Given our limited dataset, we may be able to improve the performance by applying random modifications to the images in the training data, effectively increasing the size of the dataset.\n\n# #### Create a new data generator with data augmentation\n#\n# You should now write a function to create a new `ImageDataGenerator` object, which performs the following data preprocessing and augmentation:\n#\n# * Scales the image pixel values by a factor of 1/255.\n# * Randomly rotates images by up to 30 degrees\n# * Randomly alters the brightness (picks a brightness shift value) from the range (0.5, 1.5)\n# * Randomly flips images horizontally\n#\n# Hint: you may need to refer to the [documentation](https://keras.io/preprocessing/image/#imagedatagenerator-class) for the `ImageDataGenerator`.\n\n#%%\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure to not change the function name or arguments.\n\ndef get_ImageDataGenerator_augmented():\n return ImageDataGenerator(rescale=1/255, rotation_range=30, brightness_range=(0.5, 1.5), horizontal_flip=True)\n \"\"\"\n This function should return an instance of the ImageDataGenerator class \n with the above specifications.\n \"\"\"\n\n#%%\n\n# Call the function to get an ImageDataGenerator as specified\nimage_gen_aug = get_ImageDataGenerator_augmented()\n\n#%%", "target_code": "valid_generator_aug = get_generator(image_gen_aug, valid_dir)\ntrain_generator_aug = get_generator(image_gen_aug, train_dir, seed=10)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Programming Assignment\n\n# ## Data pipeline with Keras and tf.data\n\n# ### Instructions\n#\n# In this notebook, you will implement a data processing pipeline using tools from both Keras and the tf.data module. You will use the `ImageDataGenerator` class in the tf.keras module to feed a network with training and test images from a local directory containing a subset of the LSUN dataset, and train the model both with and without data augmentation. You will then use the `map` and `filter` functions of the `Dataset` class with the CIFAR-100 dataset to train a network to classify a processed subset of the images.\n#\n# Some code cells are provided you in the notebook. You should avoid editing provided code, and make sure to execute the cells in order to avoid unexpected errors. Some cells begin with the line:\n#\n# `#### GRADED CELL ####`\n#\n# Don't move or edit this first line - this is what the automatic grader looks for to recognise graded cells. These cells require you to write your own code to complete them, and are automatically graded when you submit the notebook. Don't edit the function name or signature provided in these cells, otherwise the automatic grader might not function properly. Inside these graded cells, you can use any functions or classes that are imported below, but make sure you don't use any variables that are outside the scope of the function.\n#\n# ### How to submit\n#\n# Complete all the tasks you are asked for in the worksheet. When you have finished and are happy with your code, press the **Submit Assignment** button at the top of this notebook.\n#\n# ### Let's get started!\n#\n# We'll start running some imports, and loading the dataset. Do not edit the existing imports in the following cell. If you would like to make further Tensorflow imports, you should add them here.\n\n\n#### PACKAGE IMPORTS ####\n\n# Run this cell first to import all required packages. 
Do not make any imports elsewhere in the notebook\n\nfrom tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Dropout\nfrom tensorflow.keras.models import Sequential, Model\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nimport tensorflow as tf\nfrom tensorflow.keras.datasets import cifar100\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport json\nget_ipython().run_line_magic('matplotlib', 'inline')\n# If you would like to make further imports from tensorflow, add them here\n\n\n# ### Part 1: tf.keras\n# \n# \n# \n# \n#
\"Church\" \"Classroom\" \"Conference
\n#\n# #### The LSUN Dataset\n#\n# In the first part of this assignment, you will use a subset of the [LSUN dataset](https://www.yf.io/p/lsun). This is a large-scale image dataset with 10 scene and 20 object categories. A subset of the LSUN dataset has been provided, and has already been split into training and test sets. The three classes included in the subset are `church_outdoor`, `classroom` and `conference_room`.\n#\n# * F. Yu, A. Seff, Y. Zhang, S. Song, T. Funkhouser and J. Xia. \"LSUN: Construction of a Large-scale Image Dataset using Deep Learning with Humans in the Loop\". arXiv:1506.03365, 10 Jun 2015\n#\n# Your goal is to use the Keras preprocessing tools to construct a data ingestion and augmentation pipeline to train a neural network to classify the images into the three classes.\n\n\n# Save the directory locations for the training, validation and test sets\n\ntrain_dir = 'data/lsun/train'\nvalid_dir = 'data/lsun/valid'\ntest_dir = 'data/lsun/test'\n\n\n# #### Create a data generator using the ImageDataGenerator class\n\n# You should first write a function that creates an `ImageDataGenerator` object, which rescales the image pixel values by a factor of 1/255.\n\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure to not change the function name or arguments.\n\ndef get_ImageDataGenerator():\n return ImageDataGenerator(rescale=(1/255))\n \"\"\"\n This function should return an instance of the ImageDataGenerator class.\n This instance should be set up to rescale the data with the above scaling factor.\n \"\"\"\n\n\n# Call the function to get an ImageDataGenerator as specified\nimage_gen = get_ImageDataGenerator()\n\n\n# You should now write a function that returns a generator object that will yield batches of images and labels from the training and test set directories. The generators should:\n#\n# * Generate batches of size 20.\n# * Resize the images to 64 x 64 x 3.\n# * Return one-hot vectors for labels. These should be encoded as follows:\n# * `classroom` $\\rightarrow$ `[1., 0., 0.]`\n# * `conference_room` $\\rightarrow$ `[0., 1., 0.]`\n# * `church_outdoor` $\\rightarrow$ `[0., 0., 1.]`\n# * Pass in an optional random `seed` for shuffling (this should be passed into the `flow_from_directory` method).\n#\n# **Hint:** you may need to refer to the [documentation](https://keras.io/preprocessing/image/#imagedatagenerator-class) for the `ImageDataGenerator`.\n\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef get_generator(image_data_generator, directory, seed=None):\n\n return image_data_generator.flow_from_directory(directory, target_size=(64, 64),\n classes=['classroom', 'conference_room', 'church_outdoor'], class_mode=\"categorical\",\n batch_size=20, seed=seed)\n\n \"\"\"\n This function takes an ImageDataGenerator object in the first argument and a \n directory path in the second argument.\n It should use the ImageDataGenerator to return a generator object according \n to the above specifications. 
\n The seed argument should be passed to the flow_from_directory method.\n \"\"\"\n\n\n# Run this cell to define training and validation generators\ntrain_generator = get_generator(image_gen, train_dir)\nvalid_generator = get_generator(image_gen, valid_dir)\n\n\n# We are using a small subset of the dataset for demonstrative purposes in this assignment.\n\n# #### Display sample images and labels from the training set\n#\n# The following cell depends on your function `get_generator` to be implemented correctly. If it raises an error, go back and check the function specifications carefully.\n\n\n# Display a few images and labels from the training set\n\nbatch = next(train_generator)\nbatch_images = np.array(batch[0])\nbatch_labels = np.array(batch[1])\nlsun_classes = ['classroom', 'conference_room', 'church_outdoor']\n\nplt.figure(figsize=(16, 10))\nfor i in range(20):\n ax = plt.subplot(4, 5, i+1)\n plt.imshow(batch_images[i])\n plt.title(lsun_classes[np.where(batch_labels[i] == 1.)[0][0]])\n plt.axis('off')\n\n\n# Reset the training generator\n\ntrain_generator = get_generator(image_gen, train_dir)\n\n\n# #### Build the neural network model\n#\n# You will now build and compile a convolutional neural network classifier. Using the functional API, build your model according to the following specifications:\n#\n# * The model should use the `input_shape` in the function argument to define the Input layer.\n# * The first hidden layer should be a Conv2D layer with 8 filters, a 8x8 kernel size.\n# * The second hidden layer should be a MaxPooling2D layer with a 2x2 pooling window size.\n# * The third hidden layer should be a Conv2D layer with 4 filters, a 4x4 kernel size.\n# * The fourth hidden layer should be a MaxPooling2D layer with a 2x2 pooling window size.\n# * This should be followed by a Flatten layer, and then a Dense layer with 16 units and ReLU activation.\n# * The final layer should be a Dense layer with 3 units and softmax activation.\n# * All Conv2D layers should use `\"SAME\"` padding and a ReLU activation function.\n#\n# In total, the network should have 8 layers. The model should then be compiled with the Adam optimizer with learning rate 0.0005, categorical cross entropy loss, and categorical accuracy metric.\n\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef get_model(input_shape):\n inputs = Input(shape=input_shape)\n x = Conv2D(8, 8, activation='relu', padding='same')(inputs)\n x = MaxPooling2D(2)(x)\n x = Conv2D(4, 4, activation='relu', padding='same')(x)\n x = MaxPooling2D(2)(x)\n x = Flatten()(x)\n x = Dense(16, activation='relu')(x)\n outputs = Dense(3, activation='softmax')(x)\n model = Model(inputs=inputs, outputs=outputs)\n adam = tf.keras.optimizers.Adam(learning_rate=0.0005)\n model.compile(optimizer=adam, loss='categorical_crossentropy',\n metrics=['accuracy'])\n return model\n \"\"\"\n This function should build and compile a CNN model according to the above specification,\n using the functional API. Your function should return the model.\n \"\"\"\n\n\n# Build and compile the model, print the model summary\nlsun_model = get_model((64, 64, 3))\nlsun_model.summary()\n\n\n# #### Train the neural network model\n#\n# You should now write a function to train the model for a specified number of epochs (specified in the `epochs` argument). 
The function takes a `model` argument, as well as `train_gen` and `valid_gen` arguments for the training and validation generators respectively, which you should use for training and validation data in the training run. You should also use the following callbacks:\n#\n# * An `EarlyStopping` callback that monitors the validation accuracy and has patience set to 10.\n# * A `ReduceLROnPlateau` callback that monitors the validation loss and has the factor set to 0.5 and minimum learning set to 0.0001\n#\n# Your function should return the training history.\n\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure not to change the function name or arguments.\n\ndef train_model(model, train_gen, valid_gen, epochs):\n earlystopping = tf.keras.callbacks.EarlyStopping(patience=10)\n reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,\n min_lr=0.0001)\n history = model.fit_generator(train_gen, epochs=epochs, validation_data=valid_gen,\n callbacks=[earlystopping, reduce_lr])\n return history\n \"\"\"\n This function should define the callback objects specified above, and then use the\n train_gen and valid_gen generator object arguments to train the model for the (maximum) \n number of epochs specified in the function argument, using the defined callbacks.\n The function should return the training history.\n \"\"\"\n\n\n# Train the model for (maximum) 50 epochs\nhistory = train_model(lsun_model, train_generator, valid_generator, epochs=50)\n\n\n# #### Plot the learning curves\n\n\n# Run this cell to plot accuracy vs epoch and loss vs epoch\n\nplt.figure(figsize=(15, 5))\nplt.subplot(121)\ntry:\n plt.plot(history.history['accuracy'])\n plt.plot(history.history['val_accuracy'])\nexcept KeyError:\n try:\n plt.plot(history.history['acc'])\n plt.plot(history.history['val_acc'])\n except KeyError:\n plt.plot(history.history['categorical_accuracy'])\n plt.plot(history.history['val_categorical_accuracy'])\nplt.title('Accuracy vs. epochs')\nplt.ylabel('Accuracy')\nplt.xlabel('Epoch')\nplt.legend(['Training', 'Validation'], loc='lower right')\n\nplt.subplot(122)\nplt.plot(history.history['loss'])\nplt.plot(history.history['val_loss'])\nplt.title('Loss vs. epochs')\nplt.ylabel('Loss')\nplt.xlabel('Epoch')\nplt.legend(['Training', 'Validation'], loc='upper right')\nplt.show()\n\n\n# You may notice overfitting in the above plots, through a growing discrepancy between the training and validation loss and accuracy. We will aim to mitigate this using data augmentation. 
Given our limited dataset, we may be able to improve the performance by applying random modifications to the images in the training data, effectively increasing the size of the dataset.\n\n# #### Create a new data generator with data augmentation\n#\n# You should now write a function to create a new `ImageDataGenerator` object, which performs the following data preprocessing and augmentation:\n#\n# * Scales the image pixel values by a factor of 1/255.\n# * Randomly rotates images by up to 30 degrees\n# * Randomly alters the brightness (picks a brightness shift value) from the range (0.5, 1.5)\n# * Randomly flips images horizontally\n#\n# Hint: you may need to refer to the [documentation](https://keras.io/preprocessing/image/#imagedatagenerator-class) for the `ImageDataGenerator`.\n\n\n#### GRADED CELL ####\n\n# Complete the following function.\n# Make sure to not change the function name or arguments.\n\ndef get_ImageDataGenerator_augmented():\n return ImageDataGenerator(rescale=1/255, rotation_range=30, brightness_range=(0.5, 1.5), horizontal_flip=True)\n \"\"\"\n This function should return an instance of the ImageDataGenerator class \n with the above specifications.\n \"\"\"\n\n\n# Call the function to get an ImageDataGenerator as specified\nimage_gen_aug = get_ImageDataGenerator_augmented()\n\n\n", "project_metadata": {"full_name": "Prakadeeswaran05/Customising-your-models-with-TensorFlow-2-Coursera", "description": null, "topics": ["tensorflow", "coursera", "tensorflow2", "imperial-college-london"], "git_url": "git://github.com/Prakadeeswaran05/Customising-your-models-with-TensorFlow-2-Coursera.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-10-04T11:17:49Z", "size": 2060, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2730480}, "last_updated": "2021-01-05T08:17:24Z"}, "intent": "# define training and validation generators"}, {"original_comment": "# * Checking the distrubution of numerical columns.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Introduction\n#\n# > **[Real or Fake Job Description Prediction](https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction)**\n#\n# This dataset is designed to classify a job offer as fake or real, It contains 17880 samples of job descriptions out of which about **866 are fake** and **17014** are real. The data consists of both textual information and meta-information about the jobs. The dataset can be used to create classification models that can learn the fraudulent job descriptions.\n#\n# **Acknowledgements**\n#\n# [The University of the Aegean | Laboratory of Information & Communication Systems Security](http://emscad.samos.aegean.gr/)\n#\n# **Inspiration**\n#\n# The dataset is very valuable as it can be used to answer the following questions:\n#\n# Create a classification model that uses text data features and meta-features TO predict which job description is fraudulent or real.\n#\n# Identify key traits/features (words, entities, phrases) of job descriptions that are fraudulent.\n#\n# Run a contextual embedding model to identify the most similar job descriptions.\n# Perform Exploratory Data Analysis on the dataset to identify interesting insights from this dataset.\n#\n#\n# ---\n# ---\n# **Summary of our work**\n#\n# In this notebook, we use the pipeline of the text classification provided by [Googl ML developers guide](https://developers.google.com/machine-learning/guides/text-classification/step-2-5) and Google Brain instructors, see Fig. 
1.\n#\n# **Problems**\n# in this dataset there are many problems :\n#\n# 1 - label bias,\n# 2 - the small number of samples.\n# 3 - Feature with 80% missed values (nan).\n#\n# The solution that proposes:\n# - adding labeled samples of fake jobs to\n#\n# The solutions we used to process this data:\n# * **ML model**\n# - Using n-gram to represent the world and (tfidf or countvector)\n# - Support Vector machine\n# - Random Forest\n# - MultinomialNB\n# * **Sequencial Models with Embeddings **\n# - Sequential embeddings\n# - LSTM\n# * **Augmanted data approach**\n#\n# loading Petrained weights of big NN learned from huge Text datasets and add it to the ML algorithm.\n# - BERT ([repo](https://github.com/amaiya/ktrain/blob/master/examples/text/20newsgroup-distilbert.ipynb), [paper](https://arxiv.org/abs/1810.04805))\n#\n# ML/DL and Text frameworks used in this project :\n#\n# - Tensorflow\n# - Keras\n# - Scikitlearn\n# - SpaCy\n# - Ktrain\n# - Tansorboard: to visualize and save the results.\n#\n#\n# **Note:** this notebook is for the recruiting test (machine learning assignment) sent by 'Campany X' for An Apprenticeship position - Machine learning NLP engineer.\n#\n# ---\n# ---\n\n# ![\"Text classification flowchart\"](https://developers.google.com/machine-learning/guides/text-classification/images/TextClassificationFlowchart.png)\n# *Fig. 1: Text classification flowchart*\n\n# # packages\n\n#%%\n\n# uncomment this to check the packages\n# pip list\n\n#%%\n\n# @title\n# un-comment this cell if you are not using COLAB\n# !pip install json\n# !pip install numpy\n# !pip install pandas\n# !pip install seaborn\n# !pip install matplotlib\n# !pip install missingno\n# !pip install csv\n# !pip install sklearn\n# !pip install tensorflow\n# !pip install spacy\n# !pip install nltk\n\n\n# # Import\n\n# Download the data from G drive to local machine.\n\n#%%\n\n# @title\nfrom sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score, classification_report, plot_confusion_matrix\nimport unicodedata\nimport string\nimport re\nimport spacy\nfrom sklearn.svm import SVC\nfrom tensorboard.plugins import projector\nimport os\nimport pickle\nfrom nltk.stem import LancasterStemmer, WordNetLemmatizer\nfrom nltk.corpus import stopwords\nfrom nltk import word_tokenize, sent_tokenize\nimport inflect\nimport nltk\nfrom spacy.lang.en import English\nfrom spacy.lang.en.stop_words import STOP_WORDS\nfrom sklearn.feature_selection import f_classif\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom keras.callbacks import EarlyStopping\nfrom keras.preprocessing import sequence, text\nfrom keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\nfrom keras.layers.normalization import BatchNormalization\nfrom keras.layers.embeddings import Embedding\nfrom keras.layers.core import Dense, Activation, Dropout\nfrom keras.layers.recurrent import LSTM, GRU\nfrom keras.models import Sequential\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow import keras\nimport tensorflow as tf\nfrom sklearn import svm\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn import model_selection\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom 
sklearn.base import TransformerMixin\nfrom sklearn import preprocessing, model_selection, pipeline\nimport sklearn\nimport datetime\nimport csv\nimport missingno\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport json\nfrom oauth2client.client import GoogleCredentials\nfrom google.colab import auth\nfrom pydrive.drive import GoogleDrive\nfrom pydrive.auth import GoogleAuth\nget_ipython().system('pip install -U -q PyDrive')\n# to download the csv file from the drive,\n# This only needs to be done once per notebook.\n\n\n# Authenticate and create the PyDrive client.\n# This only needs to be done once per notebook.\nauth.authenticate_user()\ngauth = GoogleAuth()\ngauth.credentials = GoogleCredentials.get_application_default()\ndrive = GoogleDrive(gauth)\n\n'''Download a file with G ID.\nthe link of csv file that we share from drive, the file is share and any one with the link can edit.\n # https://drive.google.com/file/d/1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi\n # A file ID looks like: 1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi\n'''\n\nfileid1 = '1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi'\ndownloaded1 = drive.CreateFile({'id': fileid1})\ndownloaded1.GetContentFile(\"fake_job_postings\")\n\n#%%\n\n# @title\n# use this code to import the csv file from the G drive if you have the csv file in you own drive\n# 1. upload the CSV file to\n# 2. run this\n# from google.colab import drive\n# drive.mount('/content/drive')\n# 3. use this line to import the csv\n# note :change the path\n#data = pd.read_csv(\"/content/drive/My Drive/Datasets/fake_job_postings.csv\")\n\n#%%\n\n# @title\n# scikitlearn\n# tensorflow & keras\n# SpaCy\n# nltk\n# to save models\n\n\n# * Loading the csv file using pandas\n\n#%%\n\ndata = pd.read_csv(\"/content/fake_job_postings\")\n\n\n# # 1.Exploring the data\n#\n\n# * Show DF Dimension and content\n\n#%%\n\nprint(\"Data dimension :\", data.shape)\ndata.head()\n\n\n# * Checking the data type of columns\n\n#%%\n\ndata.dtypes", "target_code": "data.describe()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Introduction\n#\n# > **[Real or Fake Job Description Prediction](https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction)**\n#\n# This dataset is designed to classify a job offer as fake or real, It contains 17880 samples of job descriptions out of which about **866 are fake** and **17014** are real. The data consists of both textual information and meta-information about the jobs. 
The dataset can be used to create classification models that can learn the fraudulent job descriptions.\n#\n# **Acknowledgements**\n#\n# [The University of the Aegean | Laboratory of Information & Communication Systems Security](http://emscad.samos.aegean.gr/)\n#\n# **Inspiration**\n#\n# The dataset is very valuable as it can be used to answer the following questions:\n#\n# Create a classification model that uses text data features and meta-features TO predict which job description is fraudulent or real.\n#\n# Identify key traits/features (words, entities, phrases) of job descriptions that are fraudulent.\n#\n# Run a contextual embedding model to identify the most similar job descriptions.\n# Perform Exploratory Data Analysis on the dataset to identify interesting insights from this dataset.\n#\n#\n# ---\n# ---\n# **Summary of our work**\n#\n# In this notebook, we use the pipeline of the text classification provided by [Googl ML developers guide](https://developers.google.com/machine-learning/guides/text-classification/step-2-5) and Google Brain instructors, see Fig. 1.\n#\n# **Problems**\n# in this dataset there are many problems :\n#\n# 1 - label bias,\n# 2 - the small number of samples.\n# 3 - Feature with 80% missed values (nan).\n#\n# The solution that proposes:\n# - adding labeled samples of fake jobs to\n#\n# The solutions we used to process this data:\n# * **ML model**\n# - Using n-gram to represent the world and (tfidf or countvector)\n# - Support Vector machine\n# - Random Forest\n# - MultinomialNB\n# * **Sequencial Models with Embeddings **\n# - Sequential embeddings\n# - LSTM\n# * **Augmanted data approach**\n#\n# loading Petrained weights of big NN learned from huge Text datasets and add it to the ML algorithm.\n# - BERT ([repo](https://github.com/amaiya/ktrain/blob/master/examples/text/20newsgroup-distilbert.ipynb), [paper](https://arxiv.org/abs/1810.04805))\n#\n# ML/DL and Text frameworks used in this project :\n#\n# - Tensorflow\n# - Keras\n# - Scikitlearn\n# - SpaCy\n# - Ktrain\n# - Tansorboard: to visualize and save the results.\n#\n#\n# **Note:** this notebook is for the recruiting test (machine learning assignment) sent by 'Campany X' for An Apprenticeship position - Machine learning NLP engineer.\n#\n# ---\n# ---\n\n# ![\"Text classification flowchart\"](https://developers.google.com/machine-learning/guides/text-classification/images/TextClassificationFlowchart.png)\n# *Fig. 
1: Text classification flowchart*\n\n# # packages\n\n\n# uncomment this to check the packages\n# pip list\n\n\n# @title\n# un-comment this cell if you are not using COLAB\n# !pip install json\n# !pip install numpy\n# !pip install pandas\n# !pip install seaborn\n# !pip install matplotlib\n# !pip install missingno\n# !pip install csv\n# !pip install sklearn\n# !pip install tensorflow\n# !pip install spacy\n# !pip install nltk\n\n\n# # Import\n\n# Download the data from G drive to local machine.\n\n\n# @title\nfrom sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score, classification_report, plot_confusion_matrix\nimport unicodedata\nimport string\nimport re\nimport spacy\nfrom sklearn.svm import SVC\nfrom tensorboard.plugins import projector\nimport os\nimport pickle\nfrom nltk.stem import LancasterStemmer, WordNetLemmatizer\nfrom nltk.corpus import stopwords\nfrom nltk import word_tokenize, sent_tokenize\nimport inflect\nimport nltk\nfrom spacy.lang.en import English\nfrom spacy.lang.en.stop_words import STOP_WORDS\nfrom sklearn.feature_selection import f_classif\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom keras.callbacks import EarlyStopping\nfrom keras.preprocessing import sequence, text\nfrom keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\nfrom keras.layers.normalization import BatchNormalization\nfrom keras.layers.embeddings import Embedding\nfrom keras.layers.core import Dense, Activation, Dropout\nfrom keras.layers.recurrent import LSTM, GRU\nfrom keras.models import Sequential\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow import keras\nimport tensorflow as tf\nfrom sklearn import svm\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn import model_selection\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.base import TransformerMixin\nfrom sklearn import preprocessing, model_selection, pipeline\nimport sklearn\nimport datetime\nimport csv\nimport missingno\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\nimport numpy as np\nimport json\nfrom oauth2client.client import GoogleCredentials\nfrom google.colab import auth\nfrom pydrive.drive import GoogleDrive\nfrom pydrive.auth import GoogleAuth\nget_ipython().system('pip install -U -q PyDrive')\n# to download the csv file from the drive,\n# This only needs to be done once per notebook.\n\n\n# Authenticate and create the PyDrive client.\n# This only needs to be done once per notebook.\nauth.authenticate_user()\ngauth = GoogleAuth()\ngauth.credentials = GoogleCredentials.get_application_default()\ndrive = GoogleDrive(gauth)\n\n'''Download a file with G ID.\nthe link of csv file that we share from drive, the file is share and any one with the link can edit.\n # https://drive.google.com/file/d/1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi\n # A file ID looks like: 1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi\n'''\n\nfileid1 = '1E5eVot87ahE0mjvCfo-vkW8aA_AEENNi'\ndownloaded1 = drive.CreateFile({'id': fileid1})\ndownloaded1.GetContentFile(\"fake_job_postings\")\n\n\n# @title\n# use this code to import the csv file from the G drive if you have the csv file in you own drive\n# 1. 
upload the CSV file to\n# 2. run this\n# from google.colab import drive\n# drive.mount('/content/drive')\n# 3. use this line to import the csv\n# note :change the path\n#data = pd.read_csv(\"/content/drive/My Drive/Datasets/fake_job_postings.csv\")\n\n\n# @title\n# scikitlearn\n# tensorflow & keras\n# SpaCy\n# nltk\n# to save models\n\n\n# * Loading the csv file using pandas\n\n\ndata = pd.read_csv(\"/content/fake_job_postings\")\n\n\n# # 1.Exploring the data\n#\n\n# * Show DF Dimension and content\n\n\nprint(\"Data dimension :\", data.shape)\ndata.head()\n\n\n# * Checking the data type of columns\n\n\ndata.dtypes\n\n\n\n", "project_metadata": {"full_name": "YoucefBYu/Job-Hunter-Project", "description": "In order to help students and job seekers in their journey of job hunting, from our experience we got the idea to start the project after the successful result of the first prototype that we build for personal use (massive job hunting). The app extract the data of given job (link of Linkedin or any job search platform) and use it to build a short insight that focus on the important key words, required Skills, level of education and information about the company. The extracted job data and the user data (Resume, Profile) will be used as input of the processing box (the sniper agency), It has Intelligente agent that use many tools and technique to produce results for example : The NLP text generator (we call it the philosopher) that produce a perfect motivation letter based on the input and some other predefined data. the second agent \"The advisor\" : it help the user to know the job skill set that he needs to master before applying to the job he is interested in, even we suggest online courses based on users rating, in general we use prediction models in this part.", "topics": ["job", "nlp", "deep", "openai", "gpt-2", "keras", "google", "google-cloud-platform", "tensorflow", "google-services", "app-engine", "neo4j-database", "adversarial-attacks", "textattack", "firebase"], "git_url": "git://github.com/YoucefBYu/Job-Hunter-Project.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-14T14:23:00Z", "size": 12850, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 38359641}, "last_updated": "2020-12-15T15:35:10Z"}, "intent": "# * check distrubution of numerical columns"}, {"original_comment": "# show the column names of the dataset\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# import necessary libraries\nimport plotly.express as px\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\n# read in the dataset from the url\ndata = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')\ndata.sample(10)\n\n#%%", "target_code": "data.columns\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# import necessary libraries\nimport plotly.express as px\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# read in the dataset from the url\ndata = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')\ndata.sample(10)\n\n", "project_metadata": {"full_name": "sumusa/Data-Visualization-Course", "description": "Data Visualization using d3.js", "topics": [], "git_url": "git://github.com/sumusa/Data-Visualization-Course.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-09-22T15:31:19Z", "size": 13432, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter 
Notebook": 8857129, "HTML": 4332688, "JavaScript": 28419, "R": 13170, "CSS": 48}, "last_updated": "2020-12-15T08:22:53Z"}, "intent": "# show the column names of the dataset"}, {"original_comment": "# Plot reduced features in two dimensions using seaborn scatterplot\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport umap\nfrom sklearn.manifold import TSNE\nfrom sklearn.decomposition import PCA\nimport tensorflow as tf\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\n\n\n# For this assignment we will be using the fashion MNIST dataset.\n# As stated in the original [Github repository](https://github.com/zalandoresearch/fashion-mnist):\n# >Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits.\n#\n# One of the easiest ways to load the dataset is by using tf.keras. Our dataset will consist of 3 dimensional numpy arrays with shapes : (samples, rows, columns). In order to use the data with libraries that require 2 dimensional arrays with shapes (samples, features) we will reshape each sample by flattening each image. Instead of having an array of shape (28, 28) for each image we will now have a vector of length 784 (28x28=784).\n#\n# Note that below we only load the trainigng samples which we will use to perform dimensionality reduction.\n\n#%%\n\n(X_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data()\n\n\n# We keep the class names in the following list and we also create another list named `y_train_labels` which contains the the labels for each sample. We will use these both as labels for our plots.\n\n#%%\n\nclass_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n\nclass_dict = dict(enumerate(class_names))\n\ny_train_labels = np.vectorize(class_dict.get)(y_train)\n\n\n# We also define a function that will print a grid 5x5 of photos from the provided subset.\n\n#%%\n\ndef plot_images(X, y):\n plt.figure(figsize=(10, 10))\n for i in range(25):\n plt.subplot(5, 5, i+1)\n plt.xticks([])\n plt.yticks([])\n plt.grid(False)\n plt.imshow(X[i], cmap=plt.cm.binary)\n plt.xlabel(class_names[y[i]])\n plt.show()\n\n#%%\n\nplot_images(X_train, y_train)\n\n\n# We can use images in python (and other programming languages of course) as arrays/matrices. In python a byte image of 28x28 will be represented by a numpy array of shape (28,28) with each pixel/cell having a value in the range [0, 255] with 0 being black and 255 white. The values between represent shades of gray. On the same note if we had an RBG image with colors, the same array would be a 3-dimensional numpy array of shape (28, 28, 3) where each pixel is represented by three values in the range [0, 255] for each color (Red, Green, Blue).\n#\n# We have seen feature scaling in the past. In order to be able to use our images with methods that are scale sensitive we divide all values by 255, thus bringing everything in the [0, 1] range.\n\n#%%\n\nX_train = X_train/255\n\n\n# Scikit learn methods require 2-dimensional datasets (number_of_rows, number_of_features). 
We tranform each image from 28 x 28 pixels to a flat array of 784 length. To find the number of features we multiply the last two dimensions.\n\n#%%\n\nnumber_of_features = X_train.shape[1] * X_train.shape[2]\nX_train_reshaped = X_train.reshape(-1, number_of_features)\n\n\n# When we plot images we will require to reshape them back to 28x28 images.\n#\n# Also we select just a subset (1000) of our data samples and perform our exercises on this one in order to speed up execution, as our purpose is to showcase this. For more accurate/better results, we would need to do this on the whole dataset.\n\n#%%\n\nX_train_sub = X_train_reshaped[:1000]\ny_train_labels_sub = y_train_labels[:1000]\n\n#%%\n\n# Run PCA and keep the first two components\npca = PCA(n_components=2)\nX_train_pca = pca.fit_transform(X_train_sub)\n\n#%%\n\n# Plot reduced data in two dimensions using seaborn scatterplot\nsns.scatterplot(X_train_pca[:, 0], X_train_pca[:, 1], hue=y_train_labels_sub)\nplt.show()\n\n\n# We can now inverse transform the reduced data. This means that we transform it back to the original space by using only the number of components we run PCA on (2 in our case). This will let us understand what dimensionality reduction really means. We keep a subset of these latent features that maximize the explained variance. With these plots we get an idea of what \"maximum explained variance\" actually means. We get images that vaguely look like our initial articles, but without the details.\n\n#%%\n\ninversed = pca.inverse_transform(X_train_pca).reshape(-1, 28, 28)\n\n#%%\n\nplot_images(inversed, y_train)\n\n\n# Create t-SNE models with different amounts of perplexity (try between single and triple digits?) and plot their reduced features.\n\n#%%\n\n# Run TSNE for each of these perplexity values and transform X_train_sub\ntsne = TSNE(perplexity=5)\ntsne2 = TSNE(perplexity=10)\ntsne3 = TSNE(perplexity=50)\ntsne4 = TSNE(perplexity=100)\n\nX_tsne = tsne.fit_transform(X_train_sub)\nX_tsne2 = tsne2.fit_transform(X_train_sub)\nX_tsne3 = tsne3.fit_transform(X_train_sub)\nX_tsne4 = tsne4.fit_transform(X_train_sub)", "target_code": "sns.scatterplot(X_tsne[:, 0], X_tsne[:, 1], hue=y_train_labels_sub)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport umap\nfrom sklearn.manifold import TSNE\nfrom sklearn.decomposition import PCA\nimport tensorflow as tf\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\n\n\n# For this assignment we will be using the fashion MNIST dataset.\n# As stated in the original [Github repository](https://github.com/zalandoresearch/fashion-mnist):\n# >Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits.\n#\n# One of the easiest ways to load the dataset is by using tf.keras. Our dataset will consist of 3 dimensional numpy arrays with shapes : (samples, rows, columns). In order to use the data with libraries that require 2 dimensional arrays with shapes (samples, features) we will reshape each sample by flattening each image. 
Instead of having an array of shape (28, 28) for each image we will now have a vector of length 784 (28x28=784).\n#\n# Note that below we only load the trainigng samples which we will use to perform dimensionality reduction.\n\n\n(X_train, y_train), (_, _) = tf.keras.datasets.fashion_mnist.load_data()\n\n\n# We keep the class names in the following list and we also create another list named `y_train_labels` which contains the the labels for each sample. We will use these both as labels for our plots.\n\n\nclass_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n\nclass_dict = dict(enumerate(class_names))\n\ny_train_labels = np.vectorize(class_dict.get)(y_train)\n\n\n# We also define a function that will print a grid 5x5 of photos from the provided subset.\n\n\ndef plot_images(X, y):\n plt.figure(figsize=(10, 10))\n for i in range(25):\n plt.subplot(5, 5, i+1)\n plt.xticks([])\n plt.yticks([])\n plt.grid(False)\n plt.imshow(X[i], cmap=plt.cm.binary)\n plt.xlabel(class_names[y[i]])\n plt.show()\n\n\nplot_images(X_train, y_train)\n\n\n# We can use images in python (and other programming languages of course) as arrays/matrices. In python a byte image of 28x28 will be represented by a numpy array of shape (28,28) with each pixel/cell having a value in the range [0, 255] with 0 being black and 255 white. The values between represent shades of gray. On the same note if we had an RBG image with colors, the same array would be a 3-dimensional numpy array of shape (28, 28, 3) where each pixel is represented by three values in the range [0, 255] for each color (Red, Green, Blue).\n#\n# We have seen feature scaling in the past. In order to be able to use our images with methods that are scale sensitive we divide all values by 255, thus bringing everything in the [0, 1] range.\n\n\nX_train = X_train/255\n\n\n# Scikit learn methods require 2-dimensional datasets (number_of_rows, number_of_features). We tranform each image from 28 x 28 pixels to a flat array of 784 length. To find the number of features we multiply the last two dimensions.\n\n\nnumber_of_features = X_train.shape[1] * X_train.shape[2]\nX_train_reshaped = X_train.reshape(-1, number_of_features)\n\n\n# When we plot images we will require to reshape them back to 28x28 images.\n#\n# Also we select just a subset (1000) of our data samples and perform our exercises on this one in order to speed up execution, as our purpose is to showcase this. For more accurate/better results, we would need to do this on the whole dataset.\n\n\nX_train_sub = X_train_reshaped[:1000]\ny_train_labels_sub = y_train_labels[:1000]\n\n\n# Run PCA and keep the first two components\npca = PCA(n_components=2)\nX_train_pca = pca.fit_transform(X_train_sub)\n\n\n# Plot reduced data in two dimensions using seaborn scatterplot\nsns.scatterplot(X_train_pca[:, 0], X_train_pca[:, 1], hue=y_train_labels_sub)\nplt.show()\n\n\n# We can now inverse transform the reduced data. This means that we transform it back to the original space by using only the number of components we run PCA on (2 in our case). This will let us understand what dimensionality reduction really means. We keep a subset of these latent features that maximize the explained variance. With these plots we get an idea of what \"maximum explained variance\" actually means. 
We get images that vaguely look like our initial articles, but without the details.\n\n\ninversed = pca.inverse_transform(X_train_pca).reshape(-1, 28, 28)\n\n\nplot_images(inversed, y_train)\n\n\n# Create t-SNE models with different amounts of perplexity (try between single and triple digits?) and plot their reduced features.\n\n\n# Run TSNE for each of these perplexity values and transform X_train_sub\ntsne = TSNE(perplexity=5)\ntsne2 = TSNE(perplexity=10)\ntsne3 = TSNE(perplexity=50)\ntsne4 = TSNE(perplexity=100)\n\nX_tsne = tsne.fit_transform(X_train_sub)\nX_tsne2 = tsne2.fit_transform(X_train_sub)\nX_tsne3 = tsne3.fit_transform(X_train_sub)\nX_tsne4 = tsne4.fit_transform(X_train_sub)\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "intent": "# Plot reduced features in two dimensions using seaborn scatterplot"}, {"original_comment": "# Resize the figure for better display in the notebook\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Eigenvalue/Eigenvector Analysis for Undamped Systems\n#\n# MCHE 485: Mechanical Vibrations\n#\n# Dr. Joshua Vaughan\n# joshua.vaughan@louisiana.edu\n# http://www.ucs.louisiana.edu/~jev9637/\n#\n# Figure 1: A Two-Mass-Spring System\n#
\n#\n# This notebook demonstrates the eigenvalue/eigenvector problem using a two-mass-spring-damper system shown in Figure 1. We'll just look at one example set of parameters. The same techniques apply for other parameters and for larger matrices.\n#\n# The equations of motion for the system are:\n#\n# $ \\quad m_1 \\ddot{x}_1 + (k_1+k_2)x_1 - k_2 x_2 = 0 $\n#\n# $ \\quad m_2 \\ddot{x}_2 -k_2 x_1 +(k_2 + k_3)x_2 = 0 $\n#\n# We could also write these equations in matrix form:\n#\n# $ \\quad \\begin{bmatrix}m_1 & 0 \\\\ 0 & m_2\\end{bmatrix}\\begin{bmatrix}\\ddot{x}_1 \\\\ \\ddot{x}_2\\end{bmatrix} + \\begin{bmatrix}k_1 + k_2 & -k_2 \\\\ -k_2 & k_2 + k_3\\end{bmatrix}\\begin{bmatrix}x_1 \\\\ x_2\\end{bmatrix} = \\begin{bmatrix}0 \\\\ 0\\end{bmatrix}$\n#\n# Define\n#\n# $ \\quad M = \\begin{bmatrix}m_1 & 0 \\\\ 0 & m_2\\end{bmatrix} $\n#\n# and\n#\n# $ \\quad K = \\begin{bmatrix}k_1 + k_2 & -k_2 \\\\ -k_2 & k_2 + k_3\\end{bmatrix} $\n#\n# Using $M$ and $K$, we want to solve:\n#\n# $ \\quad \\left[K - \\omega^2 M\\right]\\bar{X} = 0 $\n#\n# for $\\bar{X}$. This is an eigenvalue problem.\n#\n# For information on how to obtain these equations, you can see the lectures at the [class website](http://www.ucs.louisiana.edu/~jev9637/MCHE485.html).\n#\n# We'll use the [Scipy version of the linear algebra module](http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.linalg.eigh.html). It allows us to solve the \"general\" eignevalue problem.\n\n#%%\n\nfrom scipy.integrate import odeint\nimport urllib.request\nfrom IPython.core.display import HTML\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n#%%\n\n# We want our plots to be displayed inline, not in a separate window\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Import the plotting functions\n\n#%%\n\n# Let's also improve the printing of NumPy arrays.\nnp.set_printoptions(precision=3, suppress=True)\n\n\n# To see how to solve this eigenvalue problem, we will use the parameters from the example in the book, set up below. All three spring constants are equal and the two masses are equal.\n\n#%%\n\n# Define the matrices\nm1 = 1.0\nm2 = 1.0\n\nk1 = 2 * np.pi**2\nk2 = 4.0\nk3 = 4.0\n\nM = np.asarray([[m1, 0],\n [0, m2]])\n\nK = np.asarray([[k1, -k1],\n [-k1, k1]])\n\n# M = np.asarray([[m1, 0],\n# [0, m2]])\n\n# K = np.asarray([[k1, k2],\n# [k1*m2/m1 - k3, (k2 + k3 - k2*m2/m1)]])\n\n#%%\n\n# We'll use the scipy version of the linear algebra\n\neigenvals, eigenvects = linalg.eigh(K, M)\n\n\n#\n# The linalg.eigh function returns two arrays, one of the eigenvalues and one of the eigenvectors. The eigenvalues are the square of the two natural frequencies. 
The eigenvectors are returned in normalized form, with each \"column\" of the array representing an eigenvector.\n#\n\n#%%\n\nprint('\\n')\nprint('The resulting eigenalues are {:.2f} and {:.2f}.'.format(\n eigenvals[0], eigenvals[1]))\nprint('\\n')\nprint('So the two natural frequencies are {:.2f}rad/s and {:.2f}rad/s.'.format(\n np.sqrt(eigenvals[0]), np.sqrt(eigenvals[1])))\nprint('\\n')\n\n#%%\n\nprint('\\n')\nprint('The first eigenvector is ' + str(eigenvects[:, 0]) + '.')\nprint('\\n')\nprint('The second eigenvector is ' + str(eigenvects[:, 1]) + '.')\nprint('\\n')\n\n\n# # Responses\n# Now, let's look at the response and see how it reflects these two modes\n\n#%%\n\n# Define the equations of motion\n\n# Define the system as a series of 1st order ODEs (beginnings of state-space form)\ndef eq_of_motion(w, t, p):\n \"\"\"\n Defines the differential equations for the coupled spring-mass system.\n\n Arguments:\n w : vector of the state variables:\n w = [x1, x1_dot, x2, x2_dot]\n t : time\n p : vector of the parameters:\n p = [m1, m2, k1, k2, k3]\n \"\"\"\n x1, x1_dot, x2, x2_dot = w\n m1, m2, k1, k2, k3 = p\n\n # Create sysODE = (x1', x1_dot', x2', x2_dot')\n sysODE = [x1_dot,\n (-(k1+k2)*x1 + k2*x2) / m1,\n x2_dot,\n (k2*x1 - (k2+k3)*x2) / m2]\n\n return sysODE\n\n#%%\n\n# Import the ODE solver\n\n# Set up simulation parameters\n\n# ODE solver parameters\nabserr = 1.0e-9\nrelerr = 1.0e-9\nmax_step = 0.01\nstoptime = 10.0\nnumpoints = 10001\n\n# Create the time samples for the output of the ODE solver\nt = np.linspace(0.0, stoptime, numpoints)\n\n\n# ## Mode 1\n# Let's start by looking at the first mode. For this set of parameters ($m_1 = m_2$ and $k_1 = k_2 = k_3$), the two masses move identically. To excite only this mode, we'll choose initial conditions that exactly match the mode shape.\n#\n# Here, we'll choose:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = 0$\n\n#%%\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = 0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, k1, k2, k3]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n#%%\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg 
= plt.legend(loc='upper right', fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_1.pdf')\n\n# Resize the figure for better display in the notebook\nfig.set_size_inches(9, 6)\n\n\n# ## Mode 2\n# Now, let's look at the second mode. For this set of parameters ($m_1 = m_2$ and $k_1 = k_2 = k_3$), the two masses move exactly opposite of one another in the second mode. To excite only this mode, we'll choose initial conditions that exactly match the mode shape.\n#\n# Here, we'll choose:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = 0$\n\n#%%\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = -0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, k1, k2, k3]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg = plt.legend(loc='upper right', fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_2.pdf')", "target_code": "fig.set_size_inches(9, 6)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Eigenvalue/Eigenvector Analysis for Undamped Systems\n#\n# MCHE 485: Mechanical Vibrations\n#\n# Dr. Joshua Vaughan\n# joshua.vaughan@louisiana.edu\n# http://www.ucs.louisiana.edu/~jev9637/\n#\n# Figure 1: A Two-Mass-Spring System\n#
\n#\n# This notebook demonstrates the eigenvalue/eigenvector problem using a two-mass-spring-damper system shown in Figure 1. We'll just look at one example set of parameters. The same techniques apply for other parameters and for larger matrices.\n#\n# The equations of motion for the system are:\n#\n# $ \\quad m_1 \\ddot{x}_1 + (k_1+k_2)x_1 - k_2 x_2 = 0 $\n#\n# $ \\quad m_2 \\ddot{x}_2 -k_2 x_1 +(k_2 + k_3)x_2 = 0 $\n#\n# We could also write these equations in matrix form:\n#\n# $ \\quad \\begin{bmatrix}m_1 & 0 \\\\ 0 & m_2\\end{bmatrix}\\begin{bmatrix}\\ddot{x}_1 \\\\ \\ddot{x}_2\\end{bmatrix} + \\begin{bmatrix}k_1 + k_2 & -k_2 \\\\ -k_2 & k_2 + k_3\\end{bmatrix}\\begin{bmatrix}x_1 \\\\ x_2\\end{bmatrix} = \\begin{bmatrix}0 \\\\ 0\\end{bmatrix}$\n#\n# Define\n#\n# $ \\quad M = \\begin{bmatrix}m_1 & 0 \\\\ 0 & m_2\\end{bmatrix} $\n#\n# and\n#\n# $ \\quad K = \\begin{bmatrix}k_1 + k_2 & -k_2 \\\\ -k_2 & k_2 + k_3\\end{bmatrix} $\n#\n# Using $M$ and $K$, we want to solve:\n#\n# $ \\quad \\left[K - \\omega^2 M\\right]\\bar{X} = 0 $\n#\n# for $\\bar{X}$. This is an eigenvalue problem.\n#\n# For information on how to obtain these equations, you can see the lectures at the [class website](http://www.ucs.louisiana.edu/~jev9637/MCHE485.html).\n#\n# We'll use the [Scipy version of the linear algebra module](http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.linalg.eigh.html). It allows us to solve the \"general\" eignevalue problem.\n\n\nfrom scipy.integrate import odeint\nimport urllib.request\nfrom IPython.core.display import HTML\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n\n# We want our plots to be displayed inline, not in a separate window\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Import the plotting functions\n\n\n# Let's also improve the printing of NumPy arrays.\nnp.set_printoptions(precision=3, suppress=True)\n\n\n# To see how to solve this eigenvalue problem, we will use the parameters from the example in the book, set up below. All three spring constants are equal and the two masses are equal.\n\n\n# Define the matrices\nm1 = 1.0\nm2 = 1.0\n\nk1 = 2 * np.pi**2\nk2 = 4.0\nk3 = 4.0\n\nM = np.asarray([[m1, 0],\n [0, m2]])\n\nK = np.asarray([[k1, -k1],\n [-k1, k1]])\n\n# M = np.asarray([[m1, 0],\n# [0, m2]])\n\n# K = np.asarray([[k1, k2],\n# [k1*m2/m1 - k3, (k2 + k3 - k2*m2/m1)]])\n\n\n# We'll use the scipy version of the linear algebra\n\neigenvals, eigenvects = linalg.eigh(K, M)\n\n\n#\n# The linalg.eigh function returns two arrays, one of the eigenvalues and one of the eigenvectors. The eigenvalues are the square of the two natural frequencies. 
The eigenvectors are returned in normalized form, with each \"column\" of the array representing an eigenvector.\n#\n\n\nprint('\\n')\nprint('The resulting eigenalues are {:.2f} and {:.2f}.'.format(\n eigenvals[0], eigenvals[1]))\nprint('\\n')\nprint('So the two natural frequencies are {:.2f}rad/s and {:.2f}rad/s.'.format(\n np.sqrt(eigenvals[0]), np.sqrt(eigenvals[1])))\nprint('\\n')\n\n\nprint('\\n')\nprint('The first eigenvector is ' + str(eigenvects[:, 0]) + '.')\nprint('\\n')\nprint('The second eigenvector is ' + str(eigenvects[:, 1]) + '.')\nprint('\\n')\n\n\n# # Responses\n# Now, let's look at the response and see how it reflects these two modes\n\n\n# Define the equations of motion\n\n# Define the system as a series of 1st order ODEs (beginnings of state-space form)\ndef eq_of_motion(w, t, p):\n \"\"\"\n Defines the differential equations for the coupled spring-mass system.\n\n Arguments:\n w : vector of the state variables:\n w = [x1, x1_dot, x2, x2_dot]\n t : time\n p : vector of the parameters:\n p = [m1, m2, k1, k2, k3]\n \"\"\"\n x1, x1_dot, x2, x2_dot = w\n m1, m2, k1, k2, k3 = p\n\n # Create sysODE = (x1', x1_dot', x2', x2_dot')\n sysODE = [x1_dot,\n (-(k1+k2)*x1 + k2*x2) / m1,\n x2_dot,\n (k2*x1 - (k2+k3)*x2) / m2]\n\n return sysODE\n\n\n# Import the ODE solver\n\n# Set up simulation parameters\n\n# ODE solver parameters\nabserr = 1.0e-9\nrelerr = 1.0e-9\nmax_step = 0.01\nstoptime = 10.0\nnumpoints = 10001\n\n# Create the time samples for the output of the ODE solver\nt = np.linspace(0.0, stoptime, numpoints)\n\n\n# ## Mode 1\n# Let's start by looking at the first mode. For this set of parameters ($m_1 = m_2$ and $k_1 = k_2 = k_3$), the two masses move identically. To excite only this mode, we'll choose initial conditions that exactly match the mode shape.\n#\n# Here, we'll choose:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = 0$\n\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = 0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, k1, k2, k3]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg = plt.legend(loc='upper 
right', fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_1.pdf')\n\n# Resize the figure for better display in the notebook\nfig.set_size_inches(9, 6)\n\n\n# ## Mode 2\n# Now, let's look at the second mode. For this set of parameters ($m_1 = m_2$ and $k_1 = k_2 = k_3$), the two masses move exactly opposite of one another in the second mode. To excite only this mode, we'll choose initial conditions that exactly match the mode shape.\n#\n# Here, we'll choose:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = 0$\n\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = -0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, k1, k2, k3]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg = plt.legend(loc='upper right', fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_2.pdf')\n", "project_metadata": {"full_name": "DocVaughan/MCHE485---Mechanical-Vibrations", "description": "Code supporting MCHE485: Mechanical Vibrations at the Univsersity of Louisiana at Lafayette", "topics": ["mechanical-vibrations", "fft", "jupyter-notebook", "python", "education"], "git_url": "git://github.com/DocVaughan/MCHE485---Mechanical-Vibrations.git", "stars": 21, "watchers": 21, "forks": 18, "created": "2015-01-06T00:10:33Z", "size": 43628, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4005438, "TeX": 10648, "CSS": 2772}, "last_updated": "2020-12-23T16:48:18Z"}, "intent": "# Resize the figure for better display in the notebook"}, {"original_comment": "# Print the accuracy from the testing data.\n", 
"original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification\n#\n\n# **Note:** We've adapted this Mini Project from [Lab 5 in the CS109](https://github.com/cs109/2015lab5) course. Please feel free to check out the original lab, both for more exercises, as well as solutions.\n\n# We turn our attention to **classification**. Classification tries to predict, which of a small set of classes, an observation belongs to. Mathematically, the aim is to find $y$, a **label** based on knowing a feature vector $\\x$. For instance, consider predicting gender from seeing a person's face, something we do fairly well as humans. To have a machine do this well, we would typically feed the machine a bunch of images of people which have been labelled \"male\" or \"female\" (the training set), and have it learn the gender of the person in the image from the labels and the *features* used to determine gender. Then, given a new photo, the trained algorithm returns us the gender of the person in the photo.\n#\n# There are different ways of making classifications. One idea is shown schematically in the image below, where we find a line that divides \"things\" of two different types in a 2-dimensional feature space. The classification show in the figure below is an example of a maximum-margin classifier where construct a decision boundary that is far as possible away from both classes of points. The fact that a line can be drawn to separate the two classes makes the problem *linearly separable*. Support Vector Machines (SVM) are an example of a maximum-margin classifier.\n#\n# ![Splitting using a single line](images/onelinesplit.png)\n#\n#\n\n#%%\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport sklearn.model_selection\nimport seaborn as sns\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nimport matplotlib.cm as cm\nimport matplotlib as mpl\nimport scipy as sp\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\npd.set_option('display.width', 500)\npd.set_option('display.max_columns', 100)\npd.set_option('display.notebook_repr_html', True)\nsns.set_style(\"whitegrid\")\nsns.set_context(\"poster\")\n\nc0 = sns.color_palette()[0]\nc1 = sns.color_palette()[1]\nc2 = sns.color_palette()[2]\n\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\ncm = plt.cm.RdBu\ncm_bright = ListedColormap(['#FF0000', '#0000FF'])\n\n\ndef points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light,\n cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):\n h = .02\n X = np.concatenate((Xtr, Xte))\n x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),\n np.linspace(y_min, y_max, 100))\n\n # plt.figure(figsize=(10,6))\n if zfunc:\n p0 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]\n p1 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n Z = zfunc(p0, p1)\n else:\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n ZZ = Z.reshape(xx.shape)\n if mesh:\n plt.pcolormesh(xx, yy, ZZ, cmap=cmap_light, alpha=alpha, axes=ax)\n if predicted:\n showtr = clf.predict(Xtr)\n showte = clf.predict(Xte)\n else:\n showtr = ytr\n 
showte = yte\n ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cmap_bold,\n s=psize, alpha=alpha, edgecolor=\"k\")\n # and testing points\n ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cmap_bold,\n alpha=alpha, marker=\"s\", s=psize+10)\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n return ax, xx, yy\n\n\ndef points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light,\n cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):\n ax, xx, yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False,\n colorscale=colorscale, cdiscrete=cdiscrete,\n psize=psize, alpha=alpha, predicted=True)\n Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n Z = Z.reshape(xx.shape)\n plt.contourf(xx, yy, Z, cmap=ccolor, alpha=.2, axes=ax)\n cs2 = plt.contour(xx, yy, Z, cmap=ccolor, alpha=.6, axes=ax)\n plt.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14, axes=ax)\n return ax\n\n\n# ## A Motivating Example Using `sklearn`: Heights and Weights\n\n# We'll use a dataset of heights and weights of males and females to hone our understanding of classifiers. We load the data into a dataframe and plot it.\n\n#%%\n\ndflog = pd.read_csv(\"data/01_heights_weights_genders.csv\")\ndflog.head()\n\n\n# Remember that the form of data we will use always is\n#\n# ![dataform](images/data.png)\n#\n# with the \"response\" or \"label\" $y$ as a plain array of 0s and 1s for binary classification. Sometimes we will also see -1 and +1 instead. There are also *multiclass* classifiers that can assign an observation to one of $K > 2$ classes and the labe may then be an integer, but we will not be discussing those here.\n#\n# `y = [1,1,0,0,0,1,0,1,0....]`.\n\n#

# ### Checkup Exercise Set I\n#\n# * **Exercise:** Create a scatter plot of Weight vs. Height\n# * **Exercise:** Color the points differently by Gender\n#
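#\n# *Hedged editor's sketch (not part of the original lab):* one way to satisfy the second exercise with plain matplotlib is to map `Gender` to a color per point; it assumes `dflog`, `numpy` and `matplotlib` exactly as loaded above.\n\n#%%\n\n# illustrative sketch (editor addition): color the scatter by Gender\ncolors = np.where(dflog.Gender == 'Male', 'steelblue', 'salmon')\nplt.scatter(dflog.Weight, dflog.Height, c=colors, s=5, alpha=0.5)\nplt.xlabel('Weight')\nplt.ylabel('Height')\nplt.show()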
\n#\n\n#%%\n\n# your turn\nplt.scatter(dflog.Weight, dflog.Height, alpha=0.5, s=5)\nplt.xlabel('Weight')\nplt.ylabel('Height')\nplt.show()\n\n#%%\n\nsns.pairplot(data=dflog, hue='Gender', size=10, markers=['s', 'o'])\nplt.show()\n\n\n# ### Training and Test Datasets\n#\n# When fitting models, we would like to ensure two things:\n#\n# * We have found the best model (in terms of model parameters).\n# * The model is highly likely to generalize i.e. perform well on unseen data.\n#\n#
# ### Purpose of splitting data into Training/testing sets\n#\n# * We built our model with the requirement that the model fit the data well.\n# * As a side-effect, the model will fit THIS dataset well. What about new data?\n#   * We wanted the model for predictions, right?\n# * One simple solution: leave out some data (for testing) and train the model on the rest.\n# * This also leads directly to the idea of cross-validation, next section.
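#\n# *Hedged editor's illustration (not part of the original lab):* comparing in-sample and held-out accuracy makes the bullets above concrete; the split simply previews the one performed in the next cell, and the names `Xa`, `Xb`, `ya`, `yb` and `m` are the editor's own.\n\n#%%\n\n# illustrative sketch (editor addition): accuracy on the data a model was fit on\n# is usually optimistic compared to accuracy on held-out data\nXa, Xb, ya, yb = train_test_split(dflog[['Height', 'Weight']].values, (dflog.Gender == 'Male').values, random_state=5)\nm = LogisticRegression().fit(Xa, ya)\nprint('in-sample accuracy:', accuracy_score(m.predict(Xa), ya))\nprint('held-out accuracy:', accuracy_score(m.predict(Xb), yb))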
\n#\n\n# First, we try a basic Logistic Regression:\n#\n# * Split the data into a training and test (hold-out) set\n# * Train on the training set, and test for accuracy on the testing set\n\n#%%\n\n# Split the data into a training and test set.\nXlr, Xtestlr, ylr, ytestlr = train_test_split(dflog[['Height', 'Weight']].values,\n (dflog.Gender == \"Male\").values, random_state=5)\n\nclf = LogisticRegression()\n# Fit the model on the trainng data.\nclf.fit(Xlr, ylr)\n# Print the accuracy from the testing data.\nprint(accuracy_score(clf.predict(Xtestlr), ytestlr))\n\n\n# ### Tuning the Model\n\n# The model has some hyperparameters we can tune for hopefully better performance. For tuning the parameters of your model, you will use a mix of *cross-validation* and *grid search*. In Logistic Regression, the most important parameter to tune is the *regularization parameter* $C$. Note that the regularization parameter is not always part of the logistic regression model.\n#\n# The regularization parameter is used to control for unlikely high regression coefficients, and in other cases can be used when data is sparse, as a method of feature selection.\n#\n# You will now implement some code to perform model tuning and selecting the regularization parameter $C$.\n\n# We use the following `cv_score` function to perform K-fold cross-validation and apply a scoring function to each test fold. In this incarnation we use accuracy score as the default scoring function.\n\n#%%\n\ndef cv_score(clf, x, y, score_func=accuracy_score):\n result = 0\n nfold = 5\n # split data into train/test groups, 5 times\n for train, test in KFold(nfold).split(x):\n clf.fit(x[train], y[train]) # fit\n # evaluate score function on held-out data\n result += score_func(clf.predict(x[test]), y[test])\n return result / nfold # average\n\n\n# Below is an example of using the `cv_score` function for a basic logistic regression model without regularization.\n\n#%%\n\nclf = LogisticRegression()\nscore = cv_score(clf, Xlr, ylr)\nprint(score)\n\n\n#

# ### Checkup Exercise Set II\n#\n# **Exercise:** Implement the following search procedure to find a good model:\n#\n# * You are given a list of possible values of `C` below\n# * For each C:\n#   1. Create a logistic regression model with that value of C\n#   2. Find the average score for this model using the `cv_score` function **only on the training set** `(Xlr, ylr)`\n# * Pick the C with the highest average score\n#
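#\n# *Hedged editor's sketch (not the lab's intended solution):* the same search can also be phrased with `GridSearchCV`, which is already imported at the top of this notebook; `cv=5` mirrors the 5 folds used by `cv_score`, and the names `param_grid` and `search` are the editor's own.\n\n#%%\n\n# illustrative sketch (editor addition): grid-search the same C values with 5-fold CV\nparam_grid = {'C': [0.001, 0.1, 1, 10, 100]}\nsearch = GridSearchCV(LogisticRegression(penalty='l2'), param_grid, cv=5, scoring='accuracy')\nsearch.fit(Xlr, ylr)  # training data only; the hold-out set is never touched\nprint(search.best_params_, search.best_score_)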
\n# Your goal is to find the best model parameters based *only* on the training set, without showing the model test set at all (which is why the test set is also called a *hold-out* set).\n#\n\n#%%\n\n# the grid of parameters to search over\nCs = [0.001, 0.1, 1, 10, 100]\n\n# your turn\nscores = []\nfor c in Cs:\n clf = LogisticRegression(penalty='l2', C=c)\n score = cv_score(clf, Xlr, ylr)\n scores.append(score)\n print('For C = {}, the cross-validation score is {}.'.format(c, score))\n\n\n# **Response:** Regularization applies a penalty for increasing the coefficient estimates in order to reduce overfitting. The regularization parameter $C$ in scikit-learn is the inverse of the shrinkage parameter $\\lambda$. Larger $\\lambda$ or smaller $C$ increases the shrinkage pentalty and shrinks the coefficient estimates toward zero. By default scikit-learn sets $C=1$ in logistic regression, so some amount of regularization is used even if $C$ is not specified. In the example the cross-validation score is the same for $C=0.1,1,10,100$.\n\n#

# ### Checkup Exercise Set III\n#\n# **Exercise:** Now you want to estimate how this model will predict on unseen data in the following way:\n#\n# 1. Use the C you obtained from the procedure earlier and train a Logistic Regression on the training data\n# 2. Calculate the accuracy on the test data\n#\n# You may notice that this particular value of `C` may or may not do as well as simply running the default model on a random train-test split.\n#\n# * Do you think that's a problem?\n# * Why do we need to do this whole cross-validation and grid search stuff anyway?\n#
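#\n# *Hedged editor's illustration (not part of the original lab):* the questions above hinge on the fact that a single random train-test split gives a noisy accuracy estimate; looping over a few seeds (reusing the lab's `C=0.1`) shows that spread, which is what averaging over cross-validation folds smooths out.\n\n#%%\n\n# illustrative sketch (editor addition): test accuracy varies from split to split\nfor seed in range(5):\n    Xa, Xb, ya, yb = train_test_split(dflog[['Height', 'Weight']].values, (dflog.Gender == 'Male').values, random_state=seed)\n    m = LogisticRegression(penalty='l2', C=0.1).fit(Xa, ya)\n    print('seed', seed, 'test accuracy:', accuracy_score(m.predict(Xb), yb))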
\n\n#%%\n\n# your turn\nclf = LogisticRegression(penalty='l2', C=0.1)\nclf.fit(Xlr, ylr)", "target_code": "print(accuracy_score(clf.predict(Xtestlr), ytestlr))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification\n#\n\n# **Note:** We've adapted this Mini Project from [Lab 5 in the CS109](https://github.com/cs109/2015lab5) course. Please feel free to check out the original lab, both for more exercises, as well as solutions.\n\n# We turn our attention to **classification**. Classification tries to predict, which of a small set of classes, an observation belongs to. Mathematically, the aim is to find $y$, a **label** based on knowing a feature vector $\\x$. For instance, consider predicting gender from seeing a person's face, something we do fairly well as humans. To have a machine do this well, we would typically feed the machine a bunch of images of people which have been labelled \"male\" or \"female\" (the training set), and have it learn the gender of the person in the image from the labels and the *features* used to determine gender. Then, given a new photo, the trained algorithm returns us the gender of the person in the photo.\n#\n# There are different ways of making classifications. One idea is shown schematically in the image below, where we find a line that divides \"things\" of two different types in a 2-dimensional feature space. The classification show in the figure below is an example of a maximum-margin classifier where construct a decision boundary that is far as possible away from both classes of points. The fact that a line can be drawn to separate the two classes makes the problem *linearly separable*. Support Vector Machines (SVM) are an example of a maximum-margin classifier.\n#\n# ![Splitting using a single line](images/onelinesplit.png)\n#\n#\n\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport sklearn.model_selection\nimport seaborn as sns\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.colors import ListedColormap\nimport matplotlib.cm as cm\nimport matplotlib as mpl\nimport scipy as sp\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\npd.set_option('display.width', 500)\npd.set_option('display.max_columns', 100)\npd.set_option('display.notebook_repr_html', True)\nsns.set_style(\"whitegrid\")\nsns.set_context(\"poster\")\n\nc0 = sns.color_palette()[0]\nc1 = sns.color_palette()[1]\nc2 = sns.color_palette()[2]\n\ncmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\ncmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\ncm = plt.cm.RdBu\ncm_bright = ListedColormap(['#FF0000', '#0000FF'])\n\n\ndef points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light,\n cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):\n h = .02\n X = np.concatenate((Xtr, Xte))\n x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),\n np.linspace(y_min, y_max, 100))\n\n # plt.figure(figsize=(10,6))\n if zfunc:\n p0 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]\n p1 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n Z = zfunc(p0, p1)\n else:\n Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n ZZ = Z.reshape(xx.shape)\n if mesh:\n plt.pcolormesh(xx, 
yy, ZZ, cmap=cmap_light, alpha=alpha, axes=ax)\n if predicted:\n showtr = clf.predict(Xtr)\n showte = clf.predict(Xte)\n else:\n showtr = ytr\n showte = yte\n ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cmap_bold,\n s=psize, alpha=alpha, edgecolor=\"k\")\n # and testing points\n ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cmap_bold,\n alpha=alpha, marker=\"s\", s=psize+10)\n ax.set_xlim(xx.min(), xx.max())\n ax.set_ylim(yy.min(), yy.max())\n return ax, xx, yy\n\n\ndef points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light,\n cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):\n ax, xx, yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False,\n colorscale=colorscale, cdiscrete=cdiscrete,\n psize=psize, alpha=alpha, predicted=True)\n Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n Z = Z.reshape(xx.shape)\n plt.contourf(xx, yy, Z, cmap=ccolor, alpha=.2, axes=ax)\n cs2 = plt.contour(xx, yy, Z, cmap=ccolor, alpha=.6, axes=ax)\n plt.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14, axes=ax)\n return ax\n\n\n# ## A Motivating Example Using `sklearn`: Heights and Weights\n\n# We'll use a dataset of heights and weights of males and females to hone our understanding of classifiers. We load the data into a dataframe and plot it.\n\n\ndflog = pd.read_csv(\"data/01_heights_weights_genders.csv\")\ndflog.head()\n\n\n# Remember that the form of data we will use always is\n#\n# ![dataform](images/data.png)\n#\n# with the \"response\" or \"label\" $y$ as a plain array of 0s and 1s for binary classification. Sometimes we will also see -1 and +1 instead. There are also *multiclass* classifiers that can assign an observation to one of $K > 2$ classes and the labe may then be an integer, but we will not be discussing those here.\n#\n# `y = [1,1,0,0,0,1,0,1,0....]`.\n\n#

# ### Checkup Exercise Set I\n#\n# * **Exercise:** Create a scatter plot of Weight vs. Height\n# * **Exercise:** Color the points differently by Gender\n#
\n#\n\n\n# your turn\nplt.scatter(dflog.Weight, dflog.Height, alpha=0.5, s=5)\nplt.xlabel('Weight')\nplt.ylabel('Height')\nplt.show()\n\n\nsns.pairplot(data=dflog, hue='Gender', size=10, markers=['s', 'o'])\nplt.show()\n\n\n# ### Training and Test Datasets\n#\n# When fitting models, we would like to ensure two things:\n#\n# * We have found the best model (in terms of model parameters).\n# * The model is highly likely to generalize i.e. perform well on unseen data.\n#\n#
# ### Purpose of splitting data into Training/testing sets\n#\n# * We built our model with the requirement that the model fit the data well.\n# * As a side-effect, the model will fit THIS dataset well. What about new data?\n#   * We wanted the model for predictions, right?\n# * One simple solution: leave out some data (for testing) and train the model on the rest.\n# * This also leads directly to the idea of cross-validation, next section.
\n#\n\n# First, we try a basic Logistic Regression:\n#\n# * Split the data into a training and test (hold-out) set\n# * Train on the training set, and test for accuracy on the testing set\n\n\n# Split the data into a training and test set.\nXlr, Xtestlr, ylr, ytestlr = train_test_split(dflog[['Height', 'Weight']].values,\n (dflog.Gender == \"Male\").values, random_state=5)\n\nclf = LogisticRegression()\n# Fit the model on the trainng data.\nclf.fit(Xlr, ylr)\n# Print the accuracy from the testing data.\nprint(accuracy_score(clf.predict(Xtestlr), ytestlr))\n\n\n# ### Tuning the Model\n\n# The model has some hyperparameters we can tune for hopefully better performance. For tuning the parameters of your model, you will use a mix of *cross-validation* and *grid search*. In Logistic Regression, the most important parameter to tune is the *regularization parameter* $C$. Note that the regularization parameter is not always part of the logistic regression model.\n#\n# The regularization parameter is used to control for unlikely high regression coefficients, and in other cases can be used when data is sparse, as a method of feature selection.\n#\n# You will now implement some code to perform model tuning and selecting the regularization parameter $C$.\n\n# We use the following `cv_score` function to perform K-fold cross-validation and apply a scoring function to each test fold. In this incarnation we use accuracy score as the default scoring function.\n\n\ndef cv_score(clf, x, y, score_func=accuracy_score):\n result = 0\n nfold = 5\n # split data into train/test groups, 5 times\n for train, test in KFold(nfold).split(x):\n clf.fit(x[train], y[train]) # fit\n # evaluate score function on held-out data\n result += score_func(clf.predict(x[test]), y[test])\n return result / nfold # average\n\n\n# Below is an example of using the `cv_score` function for a basic logistic regression model without regularization.\n\n\nclf = LogisticRegression()\nscore = cv_score(clf, Xlr, ylr)\nprint(score)\n\n\n#

# ### Checkup Exercise Set II\n#\n# **Exercise:** Implement the following search procedure to find a good model:\n#\n# * You are given a list of possible values of `C` below\n# * For each C:\n#   1. Create a logistic regression model with that value of C\n#   2. Find the average score for this model using the `cv_score` function **only on the training set** `(Xlr, ylr)`\n# * Pick the C with the highest average score\n#
\n# Your goal is to find the best model parameters based *only* on the training set, without showing the model test set at all (which is why the test set is also called a *hold-out* set).\n#\n\n\n# the grid of parameters to search over\nCs = [0.001, 0.1, 1, 10, 100]\n\n# your turn\nscores = []\nfor c in Cs:\n clf = LogisticRegression(penalty='l2', C=c)\n score = cv_score(clf, Xlr, ylr)\n scores.append(score)\n print('For C = {}, the cross-validation score is {}.'.format(c, score))\n\n\n# **Response:** Regularization applies a penalty for increasing the coefficient estimates in order to reduce overfitting. The regularization parameter $C$ in scikit-learn is the inverse of the shrinkage parameter $\\lambda$. Larger $\\lambda$ or smaller $C$ increases the shrinkage pentalty and shrinks the coefficient estimates toward zero. By default scikit-learn sets $C=1$ in logistic regression, so some amount of regularization is used even if $C$ is not specified. In the example the cross-validation score is the same for $C=0.1,1,10,100$.\n\n#

# ### Checkup Exercise Set III\n#\n# **Exercise:** Now you want to estimate how this model will predict on unseen data in the following way:\n#\n# 1. Use the C you obtained from the procedure earlier and train a Logistic Regression on the training data\n# 2. Calculate the accuracy on the test data\n#\n# You may notice that this particular value of `C` may or may not do as well as simply running the default model on a random train-test split.\n#\n# * Do you think that's a problem?\n# * Why do we need to do this whole cross-validation and grid search stuff anyway?\n#
\n\n\n# your turn\nclf = LogisticRegression(penalty='l2', C=0.1)\nclf.fit(Xlr, ylr)\n", "project_metadata": {"full_name": "andrewjsiu/Springboard-Coursework", "description": "Data Science Mini-Projects with Python", "topics": ["data-science"], "git_url": "git://github.com/andrewjsiu/Springboard-Coursework.git", "stars": 14, "watchers": 14, "forks": 4, "created": "2017-03-14T21:47:39Z", "size": 21701, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7256771, "Python": 3134}, "last_updated": "2021-01-10T05:37:09Z"}, "intent": "# Print the accuracy from the testing data."}, {"original_comment": "# let's separate into training and testing set\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Target guided encodings\n#\n# In the previous lectures in this section, we learned how to convert a label into a number, by using one hot encoding, replacing by a digit or replacing by frequency or counts of observations. These methods are simple, make (almost) no assumptions and work generally well in different scenarios.\n#\n# There are however methods that allow us to capture information while pre-processing the labels of categorical variables. These methods include:\n#\n# - Ordering the labels according to the target\n# - Replacing labels by the target mean (mean encoding / target encoding)\n# - Replacing the labels by the probability ratio of the target being 1 or 0\n# - Weight of evidence.\n#\n# All of the above methods have something in common:\n#\n# - the encoding is **guided by the target**, and\n# - they create a **monotonic relationship** between the variable and the target.\n#\n#\n# ### Monotonicity\n#\n# A monotonic relationship is a relationship that does one of the following:\n#\n# - (1) as the value of one variable increases, so does the value of the other variable; or\n# - (2) as the value of one variable increases, the value of the other variable decreases.\n#\n# In this case, as the value of the independent variable (predictor) increases, so does the target, or conversely, as the value of the variable increases, the target value decreases.\n#\n#\n#\n# ### Advantages of target guided encodings\n#\n# - Capture information within the category, therefore creating more predictive features\n# - Create a monotonic relationship between the variable and the target, therefore suitable for linear models\n# - Do not expand the feature space\n#\n#\n# ### Limitations\n#\n# - Prone to cause over-fitting\n# - Difficult to cross-validate with current libraries\n#\n#\n# ### Note\n#\n# The methods discussed in this and the coming 3 lectures can be also used on numerical variables, after discretisation. This creates a monotonic relationship between the numerical variable and the target, and therefore improves the performance of linear models. I will discuss this in more detail in the section \"Discretisation\".\n#\n# ===============================================================================\n#\n# ## Probability Ratio Encoding\n#\n# These encoding is suitable for classification problems only, where the target is binary.\n#\n# For each category, we calculate the mean of target=1, that is the probability of the target being 1 ( P(1) ), and the probability of the target=0 ( P(0) ). 
And then, we calculate the ratio P(1)/P(0), and replace the categories by that ratio.\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding with feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n#%%\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n#%%\n\n# let's remove obserrvations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n#%%\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n#%%\n\n# and we remove the observations where cabin = T\n# because they are too few\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n#%%\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n#%%\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n#%%\n\ndata['sex'].unique()\n\n#%%\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We calculate the ratio P(1)/P(0) using the train set, and then use those mappings in the test set.\n#\n# Note that to implement this in pandas, we need to keep the target in the training set.\n\n#%%\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n#%%\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Probability ratio encoding with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n\n#%%\n\n# let's calculate the probability of survived = 1 per category\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n\n# and capture it into a dataframe\nprob_df = pd.DataFrame(prob_df)\nprob_df\n\n#%%\n\n# and now the probability of survived = 0\n\nprob_df['died'] = 1 - prob_df['survived']\n\nprob_df\n\n#%%\n\n# and now the ratio\n\nprob_df['ratio'] = prob_df['survived'] / prob_df['died']\n\nprob_df\n\n#%%\n\n# and now let's capture the ratio in a dictionary\n\nordered_labels = prob_df['ratio'].to_dict()\n\nordered_labels\n\n#%%\n\n# now, we replace the labels with the ratios\n\nX_train['cabin'] = X_train['cabin'].map(ordered_labels)\nX_test['cabin'] = X_test['cabin'].map(ordered_labels)\n\n#%%\n\n# let's explore the result\n\nX_train['cabin'].head(10)\n\n#%%\n\n# we can 
turn the previous commands into 2 functions\n\n\ndef find_category_mappings(df, variable, target):\n\n tmp = pd.DataFrame(df.groupby([variable])[target].mean())\n\n tmp['non-target'] = 1 - tmp[target]\n\n tmp['ratio'] = tmp[target] / tmp['non-target']\n\n return tmp['ratio'].to_dict()\n\n\ndef integer_encode(train, test, variable, ordinal_mapping):\n\n X_train[variable] = X_train[variable].map(ordinal_mapping)\n X_test[variable] = X_test[variable].map(ordinal_mapping)\n\n#%%\n\n# and now we run a loop over the remaining categorical variables\n\nfor variable in ['sex', 'embarked']:\n\n mappings = find_category_mappings(X_train, variable, 'survived')\n\n integer_encode(X_train, X_test, variable, mappings)\n\n#%%\n\n# let's see the result\n\nX_train.head()\n\n#%%\n\n# let's inspect the newly created monotonic relationship\n# between the categorical variables and the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Monotonic relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survived')\n plt.show()\n\n\n# Note the monotonic relationships between the mean target and the categories.\n#\n# ### Note\n#\n# Replacing categorical labels with this code and method will generate missing values for categories present in the test set that were not seen in the training set. Therefore it is extremely important to handle rare labels before-hand. I will explain how to do this, in a later notebook.\n#\n# **In addition, it will create NA or Inf if the probability of target = 0 is zero, as the division by zero is not defined.**\n\n# ## Probability Ratio Encoding with Feature-Engine\n#\n# If using Feature-Engine, instead of pandas, we do not need to keep the target variable in the training dataset.\n\n#%%", "target_code": "X_train, X_test, y_train, y_test = train_test_split(\n data[['cabin', 'sex', 'embarked']], # predictors\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Target guided encodings\n#\n# In the previous lectures in this section, we learned how to convert a label into a number, by using one hot encoding, replacing by a digit or replacing by frequency or counts of observations. These methods are simple, make (almost) no assumptions and work generally well in different scenarios.\n#\n# There are however methods that allow us to capture information while pre-processing the labels of categorical variables. 
These methods include:\n#\n# - Ordering the labels according to the target\n# - Replacing labels by the target mean (mean encoding / target encoding)\n# - Replacing the labels by the probability ratio of the target being 1 or 0\n# - Weight of evidence.\n#\n# All of the above methods have something in common:\n#\n# - the encoding is **guided by the target**, and\n# - they create a **monotonic relationship** between the variable and the target.\n#\n#\n# ### Monotonicity\n#\n# A monotonic relationship is a relationship that does one of the following:\n#\n# - (1) as the value of one variable increases, so does the value of the other variable; or\n# - (2) as the value of one variable increases, the value of the other variable decreases.\n#\n# In this case, as the value of the independent variable (predictor) increases, so does the target, or conversely, as the value of the variable increases, the target value decreases.\n#\n#\n#\n# ### Advantages of target guided encodings\n#\n# - Capture information within the category, therefore creating more predictive features\n# - Create a monotonic relationship between the variable and the target, therefore suitable for linear models\n# - Do not expand the feature space\n#\n#\n# ### Limitations\n#\n# - Prone to cause over-fitting\n# - Difficult to cross-validate with current libraries\n#\n#\n# ### Note\n#\n# The methods discussed in this and the coming 3 lectures can be also used on numerical variables, after discretisation. This creates a monotonic relationship between the numerical variable and the target, and therefore improves the performance of linear models. I will discuss this in more detail in the section \"Discretisation\".\n#\n# ===============================================================================\n#\n# ## Probability Ratio Encoding\n#\n# These encoding is suitable for classification problems only, where the target is binary.\n#\n# For each category, we calculate the mean of target=1, that is the probability of the target being 1 ( P(1) ), and the probability of the target=0 ( P(0) ). 
And then, we calculate the ratio P(1)/P(0), and replace the categories by that ratio.\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding with feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n\n# let's remove obserrvations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n\n# and we remove the observations where cabin = T\n# because they are too few\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n\ndata['sex'].unique()\n\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We calculate the ratio P(1)/P(0) using the train set, and then use those mappings in the test set.\n#\n# Note that to implement this in pandas, we need to keep the target in the training set.\n\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Probability ratio encoding with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n\n\n# let's calculate the probability of survived = 1 per category\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n\n# and capture it into a dataframe\nprob_df = pd.DataFrame(prob_df)\nprob_df\n\n\n# and now the probability of survived = 0\n\nprob_df['died'] = 1 - prob_df['survived']\n\nprob_df\n\n\n# and now the ratio\n\nprob_df['ratio'] = prob_df['survived'] / prob_df['died']\n\nprob_df\n\n\n# and now let's capture the ratio in a dictionary\n\nordered_labels = prob_df['ratio'].to_dict()\n\nordered_labels\n\n\n# now, we replace the labels with the ratios\n\nX_train['cabin'] = X_train['cabin'].map(ordered_labels)\nX_test['cabin'] = X_test['cabin'].map(ordered_labels)\n\n\n# let's explore the result\n\nX_train['cabin'].head(10)\n\n\n# we can turn the previous commands into 2 functions\n\n\ndef find_category_mappings(df, variable, 
target):\n\n tmp = pd.DataFrame(df.groupby([variable])[target].mean())\n\n tmp['non-target'] = 1 - tmp[target]\n\n tmp['ratio'] = tmp[target] / tmp['non-target']\n\n return tmp['ratio'].to_dict()\n\n\ndef integer_encode(train, test, variable, ordinal_mapping):\n\n X_train[variable] = X_train[variable].map(ordinal_mapping)\n X_test[variable] = X_test[variable].map(ordinal_mapping)\n\n\n# and now we run a loop over the remaining categorical variables\n\nfor variable in ['sex', 'embarked']:\n\n mappings = find_category_mappings(X_train, variable, 'survived')\n\n integer_encode(X_train, X_test, variable, mappings)\n\n\n# let's see the result\n\nX_train.head()\n\n\n# let's inspect the newly created monotonic relationship\n# between the categorical variables and the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Monotonic relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survived')\n plt.show()\n\n\n# Note the monotonic relationships between the mean target and the categories.\n#\n# ### Note\n#\n# Replacing categorical labels with this code and method will generate missing values for categories present in the test set that were not seen in the training set. Therefore it is extremely important to handle rare labels before-hand. I will explain how to do this, in a later notebook.\n#\n# **In addition, it will create NA or Inf if the probability of target = 0 is zero, as the division by zero is not defined.**\n\n# ## Probability Ratio Encoding with Feature-Engine\n#\n# If using Feature-Engine, instead of pandas, we do not need to keep the target variable in the training dataset.\n\n\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "intent": "# separate into training and 30% testing set"}, {"original_comment": "# Mean Squared Error loss function\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# (Adapted from my solutions to homework problems for ENM531: Data Driven Modelling at UPenn)\n\n#%%\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import confusion_matrix\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport pyDOE # for Latin Hypercube Sampling\n\n\n# # Binary classification using logistic regression model and neural network for reduced-dimensionality image dataset\n\n#%%\n\n# Custom function definitions\n\n# Glorot initialization of weight matrix\ndef glorot_init_mat(shape):\n din = shape[0]\n dout = shape[1]\n var = torch.tensor([2.0/(din+dout)])\n std = torch.sqrt(var)\n mean = torch.tensor([0.0])\n dist = torch.distributions.normal.Normal(mean, std)\n return dist.sample(shape)", "target_code": "def MSE_loss(Ypred, Ytrue):\n return torch.mean((Ytrue - Ypred)**2)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# (Adapted from my solutions to homework problems for ENM531: Data Driven Modelling at UPenn)\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import 
confusion_matrix\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport pyDOE # for Latin Hypercube Sampling\n\n\n# # Binary classification using logistic regression model and neural network for reduced-dimensionality image dataset\n\n\n# Custom function definitions\n\n# Glorot initialization of weight matrix\ndef glorot_init_mat(shape):\n din = shape[0]\n dout = shape[1]\n var = torch.tensor([2.0/(din+dout)])\n std = torch.sqrt(var)\n mean = torch.tensor([0.0])\n dist = torch.distributions.normal.Normal(mean, std)\n return dist.sample(shape)\n\n\n", "project_metadata": {"full_name": "apallath/basic_dl", "description": "Deep learning models in PyTorch", "topics": [], "git_url": "git://github.com/apallath/basic_dl.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-04-26T23:47:05Z", "size": 100322, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1139747, "Python": 46387}, "last_updated": "2020-12-25T23:23:01Z"}, "intent": "# Mean Squared Error loss function"}, {"original_comment": "# # Covariance\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport sklearn.covariance\nimport sklearn.decomposition\nfrom sklearn.preprocessing import (\n RobustScaler\n)\nfrom sklearn.neighbors import (\n KernelDensity,\n KDTree,\n)\nimport sklearn.neighbors\nimport time\nimport itertools\nimport math\nimport pandas as pd\nimport timeit\nimport matplotlib.pyplot as plt\nimport matplotlib\nimport scipy.stats\nimport sklearn\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ndf = pd.read_csv(\n \"../rawdata/mnist_train.csv\",\n header=None,\n).rename(\n columns={0: \"label\"}\n)\n\n#%%\n\nnp.random.seed(0)\nnonzeroes = df[df.label > 0].values[:, 1:785]\nzeroes = df[df.label == 0].values[:, 1:785]\nnp.random.shuffle(nonzeroes)\nnp.random.shuffle(zeroes)\nsz = zeroes[:50]\nsnz = nonzeroes[:50]\nprint(len(nonzeroes))\nprint(len(zeroes))\n\ncombined = np.vstack([zeroes, snz])\nprint(len(combined))\nreal_outliers = np.concatenate([\n np.zeros(len(zeroes), dtype=bool),\n np.ones(len(snz), dtype=bool)\n])\n\n#%%\n\ndef estimate_kde_bw(data):\n q3 = np.percentile(data, 75, axis=0)\n q1 = np.percentile(data, 25, axis=0)\n iqr = q3 - q1\n bw = iqr * (data.shape[0])**(-1.0/(data.shape[1]+4))\n return bw\n\n#%%\n\ndef eval_scores(scores, true_outliers):\n cutoff = np.percentile(scores, 99)\n outliers = scores > cutoff\n both = sum(np.logical_and(outliers, true_outliers))\n predict = sum(outliers)\n actual = sum(true_outliers)\n print(\"{}/{},{}\".format(both, predict, actual))\n return outliers\n\n#%%\n\npca = sklearn.decomposition.PCA(\n n_components=15,\n random_state=0,\n svd_solver='randomized'\n).fit(df.values[:, 1:785])\n\n#%%\n\nrdata = pca.transform(combined)\nrdata.shape", "target_code": "mcd = sklearn.covariance.EmpiricalCovariance(\n)\nmcd.fit(rdata)\nmcd_scores = mcd.mahalanobis(rdata)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport sklearn.covariance\nimport sklearn.decomposition\nfrom sklearn.preprocessing import (\n RobustScaler\n)\nfrom sklearn.neighbors import (\n KernelDensity,\n KDTree,\n)\nimport sklearn.neighbors\nimport time\nimport itertools\nimport math\nimport pandas as pd\nimport timeit\nimport matplotlib.pyplot as plt\nimport matplotlib\nimport scipy.stats\nimport sklearn\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ndf = pd.read_csv(\n \"../rawdata/mnist_train.csv\",\n header=None,\n).rename(\n columns={0: 
\"label\"}\n)\n\n\nnp.random.seed(0)\nnonzeroes = df[df.label > 0].values[:, 1:785]\nzeroes = df[df.label == 0].values[:, 1:785]\nnp.random.shuffle(nonzeroes)\nnp.random.shuffle(zeroes)\nsz = zeroes[:50]\nsnz = nonzeroes[:50]\nprint(len(nonzeroes))\nprint(len(zeroes))\n\ncombined = np.vstack([zeroes, snz])\nprint(len(combined))\nreal_outliers = np.concatenate([\n np.zeros(len(zeroes), dtype=bool),\n np.ones(len(snz), dtype=bool)\n])\n\n\ndef estimate_kde_bw(data):\n q3 = np.percentile(data, 75, axis=0)\n q1 = np.percentile(data, 25, axis=0)\n iqr = q3 - q1\n bw = iqr * (data.shape[0])**(-1.0/(data.shape[1]+4))\n return bw\n\n\ndef eval_scores(scores, true_outliers):\n cutoff = np.percentile(scores, 99)\n outliers = scores > cutoff\n both = sum(np.logical_and(outliers, true_outliers))\n predict = sum(outliers)\n actual = sum(true_outliers)\n print(\"{}/{},{}\".format(both, predict, actual))\n return outliers\n\n\npca = sklearn.decomposition.PCA(\n n_components=15,\n random_state=0,\n svd_solver='randomized'\n).fit(df.values[:, 1:785])\n\n\nrdata = pca.transform(combined)\nrdata.shape\n\n\n\n", "project_metadata": {"full_name": "stanford-futuredata/tKDC", "description": "Repository for tKDE Experiments", "topics": [], "git_url": "git://github.com/stanford-futuredata/tKDC.git", "stars": 6, "watchers": 6, "forks": 2, "created": "2016-09-05T21:57:14Z", "size": 11233, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11527710, "Java": 124843, "Python": 66371, "Shell": 17742, "R": 8114}, "last_updated": "2020-03-02T08:30:10Z"}, "intent": "# compute Covariance"}, {"original_comment": "# ## Create the GridSearchCV object: cv\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Ridge Regression With Python\n\n# ## import libraries\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.linear_model import Ridge\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nfrom pandas import DataFrame\nimport numpy as np\n\n\n# ## read data\n\n#%%\n\ndf_train = pd.read_csv('../input/train.csv')\ndf_test = pd.read_csv('../input/test.csv')\n\n\n# ## store Ids of homes\n\n#%%\n\ndf_train = df_train.drop('Id', axis=1)\ny_id = df_test['Id'].copy()\ndf_test = df_test.drop('Id', axis=1)\n\n\n# ## Define y_train\n\n#%%\n\ny_train = df_train['SalePrice'].values.reshape(-1, 1)\ndf_train = df_train.drop('SalePrice', axis=1)\n\n\n# ## Transform y_train to match the evaluation metric\n\n#%%\n\ny_train = np.log(y_train+1)\n\n\n# ## concate df_train and df_test\n\n#%%\n\ndf = pd.concat([df_train, df_test], axis=0, ignore_index=True)\n\n\n# ## select columns with non null values\n\n#%%\n\ndf = df.dropna(axis=1)\n\n\n# ## Transform categorical variables into dummy variables\n\n#%%\n\ndf = pd.get_dummies(df, drop_first=True)\n\n\n# ## create X_train and X_test\n\n#%%\n\nX_train = df.iloc[:df_train.shape[0], ]\nX_test = df.iloc[df_train.shape[0]:, ]\n\n\n# ## import libraries\n\n#%%\n\n# ## steps\n\n#%%\n\nsteps = [('scaler', StandardScaler()),\n ('ridge', Ridge())]\n\n\n# ## Create the pipeline: pipeline\n\n#%%\n\npipeline = Pipeline(steps)\n\n\n# ## Specify the hyperparameter space\n\n#%%\n\nparameters = {'ridge__alpha': np.logspace(-4, 0, 50)}\n\n\n#", "target_code": "from sklearn.model_selection import GridSearchCV\n\ncv = GridSearchCV(pipeline, parameters, cv=3)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Ridge Regression With 
Python\n\n# ## import libraries\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import Ridge\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nfrom pandas import DataFrame\nimport numpy as np\n\n\n# ## read data\n\n\ndf_train = pd.read_csv('../input/train.csv')\ndf_test = pd.read_csv('../input/test.csv')\n\n\n# ## store Ids of homes\n\n\ndf_train = df_train.drop('Id', axis=1)\ny_id = df_test['Id'].copy()\ndf_test = df_test.drop('Id', axis=1)\n\n\n# ## Define y_train\n\n\ny_train = df_train['SalePrice'].values.reshape(-1, 1)\ndf_train = df_train.drop('SalePrice', axis=1)\n\n\n# ## Transform y_train to match the evaluation metric\n\n\ny_train = np.log(y_train+1)\n\n\n# ## concate df_train and df_test\n\n\ndf = pd.concat([df_train, df_test], axis=0, ignore_index=True)\n\n\n# ## select columns with non null values\n\n\ndf = df.dropna(axis=1)\n\n\n# ## Transform categorical variables into dummy variables\n\n\ndf = pd.get_dummies(df, drop_first=True)\n\n\n# ## create X_train and X_test\n\n\nX_train = df.iloc[:df_train.shape[0], ]\nX_test = df.iloc[df_train.shape[0]:, ]\n\n\n# ## import libraries\n\n\n# ## steps\n\n\nsteps = [('scaler', StandardScaler()),\n ('ridge', Ridge())]\n\n\n# ## Create the pipeline: pipeline\n\n\npipeline = Pipeline(steps)\n\n\n# ## Specify the hyperparameter space\n\n\nparameters = {'ridge__alpha': np.logspace(-4, 0, 50)}\n\n\n#\n\n\n\n", "project_metadata": {"full_name": "harunurrashid97/Regression-with-python-and-R", "description": "Regression with Python & R.", "topics": ["machine-learning-algorithms", "regression-models", "python3", "r", "pyplot", "matplotlib", "correlation-coefficient", "correlation", "regression", "linear-regression", "python", "plot", "regression-algorithms", "regression-analysis"], "git_url": "git://github.com/harunurrashid97/Regression-with-python-and-R.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2018-03-09T22:04:46Z", "size": 1918, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 285050, "R": 11619, "Python": 2074}, "last_updated": "2019-11-18T13:21:59Z"}, "intent": "# Create the GridSearchCV object: cv"}, {"original_comment": "# Construct rbf kernel object with .5 gamma\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Kernels\n# In this notebook we will compute the kernel (kernel as in kernel methods for non parametric regression, classification...) of a set of randomly sample set of 100 pixels.\n#\n# Sample pixels an get them into a `numpy` array for further checks.\n\n#%%\n\nfrom scipy.spatial.distance import pdist, squareform\nimport ee\nimport numpy as np\nfrom ee_ipl_uv import kernel\n\nee.Initialize()\n\n# Get some bands of an image\nim_original = ee.Image('LANDSAT/LC8_L1T_TOA/LC81980332015119LGN00')\nproperties = [\"B4\", \"B3\", \"B2\"]\nim = im_original.select(properties)\n\n# Sample 100 pixels of this 3 band image using seed 45. 
Resulting object is a featureCollection\nfeature_collection = im.sample(numPixels=100, seed=45)\n\n# We have 100 features with 4 properties each (the properties are the bands of the image plus the index)\nprint(\"Number of features: \", feature_collection.size().getInfo())\nprint(\"Properties: \", ee.Feature(\n feature_collection.first()).propertyNames().getInfo())\n\n# Construct kernel object\nkernel_lineal = kernel.Kernel(feature_collection, properties)\n\n# Convert local feature_collection to numpy array\narray = kernel_lineal.getNumpy()\n\nprint(\"The local numpy retrieved array has shape: \", array.shape)\n\n#%%\n\n# Select first element on the feature collection\ncurrent_array = ee.Feature(feature_collection.first()).toArray(properties)\n\nprint(current_array.getInfo())\n\n\n# ## Apply RBF kernel\n#\n# Let's try an [rbf](https://en.wikipedia.org/wiki/Radial_basis_function_kernel) kernel. Which is also implemented our `kernel` module.\n\n#%%", "target_code": "kernel_rbf = kernel.Kernel(\n feature_collection, properties, distancia=kernel.RBFDistance(.5))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Kernels\n# In this notebook we will compute the kernel (kernel as in kernel methods for non parametric regression, classification...) of a set of randomly sample set of 100 pixels.\n#\n# Sample pixels an get them into a `numpy` array for further checks.\n\n\nfrom scipy.spatial.distance import pdist, squareform\nimport ee\nimport numpy as np\nfrom ee_ipl_uv import kernel\n\nee.Initialize()\n\n# Get some bands of an image\nim_original = ee.Image('LANDSAT/LC8_L1T_TOA/LC81980332015119LGN00')\nproperties = [\"B4\", \"B3\", \"B2\"]\nim = im_original.select(properties)\n\n# Sample 100 pixels of this 3 band image using seed 45. Resulting object is a featureCollection\nfeature_collection = im.sample(numPixels=100, seed=45)\n\n# We have 100 features with 4 properties each (the properties are the bands of the image plus the index)\nprint(\"Number of features: \", feature_collection.size().getInfo())\nprint(\"Properties: \", ee.Feature(\n feature_collection.first()).propertyNames().getInfo())\n\n# Construct kernel object\nkernel_lineal = kernel.Kernel(feature_collection, properties)\n\n# Convert local feature_collection to numpy array\narray = kernel_lineal.getNumpy()\n\nprint(\"The local numpy retrieved array has shape: \", array.shape)\n\n\n# Select first element on the feature collection\ncurrent_array = ee.Feature(feature_collection.first()).toArray(properties)\n\nprint(current_array.getInfo())\n\n\n# ## Apply RBF kernel\n#\n# Let's try an [rbf](https://en.wikipedia.org/wiki/Radial_basis_function_kernel) kernel. 
Which is also implemented our `kernel` module.\n\n", "project_metadata": {"full_name": "IPL-UV/ee_ipl_uv", "description": "Multitemporal Cloud Masking in the Google Earth Engine", "topics": [], "git_url": "git://github.com/IPL-UV/ee_ipl_uv.git", "stars": 31, "watchers": 31, "forks": 7, "created": "2018-04-26T14:12:43Z", "size": 18065, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3105357, "Python": 102282}, "last_updated": "2020-12-29T08:43:11Z"}, "intent": "# Construct rbf kernel object with .5 gamma"}, {"original_comment": "# Use the 'Seaborn' plotting style in all subsequent visualisations:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')", "target_code": "import matplotlib.pyplot as plt\n\nplt.style.use('seaborn')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. 
Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "intent": "# Use the 'Seaborn' plotting style in all subsequent visualisations:"}, {"original_comment": "# plot LWE thickness on secondary axis\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#\n# \n#\n# # __Amazon river freshwater discharge impacts on coastal waters:__\n#\n# ## __Hands-on tutorial of AWS in-region access of NASA Earthdata products__\n#\n#\n# This notebook provides a basic end-to-end workflow to interact with data \"in-place\" from the NASA Earthdata Cloud, by accessing AWS S3 locations provided by [NASA Harmony](http://harmony.earthdata.nasa.gov/) outputs without the need to download data. While these outputs can be downloaded locally, the cloud offers the ability to scale compute resources to perform analyses over large areas and time spans, which is critical as data volumes continue to grow.\n#\n# This workflow combines search, discovery, access, reformatting, basic analyses, and plotting components presented during Part-II. 
Though the example we're working with in this notebook only focuses on a small time and area to account for a large number of concurrent processing requests, this workflow can be modified and scaled up to suit a larger time range and region of interest.\n#\n# #### Learning objectives:\n#\n# - Understand the Pangeo BinderHub environment used during the workshop and how to execute code within a Jupyter Notebook\n# - Search for Liquid Water Equivalent (LWE) data from GRACE/GRACE-FO and Sea Surface Salinity (SSS) from SMAP\n# - Execute programmatic data access queries, plotting, and direct in-region cloud access using open source Python libraries.\n# - Access data in Zarr format from Earthdata Cloud (AWS)\n# - Subset both, plot and compare coincident data.\n# - Identify resources, including the Earthdata Cloud Primer, for getting started with Amazon Web Services outside of the Workshop to access and work with data with a cloud environment.\n#\n\n# ___\n#\n#

\n# \n# \n#\n#

\n#\n#\n#\n#\n# ### __Pangeo BinderHub and Project Jupyter__\n#\n# First, some basics on the Pangeo compute environment used during the live workshop and how to interact with Jupyter Notebooks and the Jupyter Lab interface.\n#\n# * [Pangeo BinderHub](https://binder.pangeo.io/): A multi-user server for interactive data analysis. This Hub is running in the AWS `us-west-2` region, which is where all Earthdata Cloud data and transformation service outputs are located. Pangeo is supported, in part, by the National Science Foundation (NSF) and the National Aeronautics and Space Administration (NASA). Google provided compute credits on Google Compute Engine. The Pangeo community promotes open, reproducible, and scalable science. We thank you for supporting this AGU Workshop.\n#\n# **This Hub is only supported during the live AGU workshop**. See instructions at the bottom of this notebook for how to set up your own AWS EC2 instance so that you can perform the same cloud access within your personal AWS environment.\n#\n# * [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/latest/): Interactive, reproducible, open source, and exploratory browser integrated computing environment.\n# * [JupyterLab](https://github.com/jupyterlab/jupyterlab): Web-based integrated IDE for computational workflows.\n\n# ### __Jupyter Notebook Basics__\n#\n# The body of a notebook is composed of cells. Each cell contains either markdown, code input, code output, or raw text. Cells can be included in any order and edited and executed at-will.\n#\n# **Markdown cells** - These are used to build a nicely formatted narrative around the code in the document.\n#\n# **Code cells** - These are used to define the computational code in the document. They come in two forms: the input cell where the user types the code to be executed, and the output cell which is the representation of the executed code.\n#\n# **Raw cells** - These are used when text needs to be included in raw form, without execution or transformation.\n#\n# #### Execute a cell or selected cells by pressing shift + enter\n#\n\n#%%\n\nimport json\nfrom IPython.display import HTML\nimport cartopy\nimport cartopy.crs as ccrs\nimport matplotlib.animation as animation\nimport xarray as xr\nimport pandas as pd\nimport numpy as np\nimport requests\nimport time\nimport dask.array as da\nimport matplotlib.pyplot as plt\nimport zarr\nimport rasterio\nimport s3fs\nimport intake\nfrom pprint import pprint\nfrom os.path import join, expanduser\nfrom http.cookiejar import CookieJar\nfrom urllib import request\nfrom getpass import getpass\nfrom platform import system\nfrom netrc import netrc\nprint('Hello World!')\n\n\n# #### Collapse a cell or cell output by clicking on the blue line to the left of the cell\n#\n# The cell content is replaced by three dots, indicating that the cell is collapsed.\n#\n# #### Execute multiple cells or run the entire notebook\n# Select cells with **shift + Up** or **shift + Down** and then execute selection with **shift + enter**.\n#\n# #### Run the whole notebook in a single step by clicking on the menu Run -> Run All Cells.\n#\n# See https://jupyter.readthedocs.io/en/latest/running.html for more guidance on running notebooks.\n\n# ___\n# ### __Import modules__\n#\n# The Python ecosystem is organized into modules. A module must be imported before the contents of that modules can be used. It is good practice to import modules in the first code cell of a notebook or at the top of your script. 
Not only does this make it clear which modules are being used, but it also ensures that the code fails at the beginning because one of the modules is not installed rather half way through after crunching a load of data.\n#\n# For some modules, it is common practice to shorten the module names according to accepted conventions. For example, the plotting module `matplotlib.pyplot` is shortened to `plt`. It is best to stick to these conventions rather than making up your own short names so that people reading your code see immediately what you are doing.\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## __Earthdata Login__\n#\n# An Earthdata Login account is required to access data, as well as discover restricted data, from the NASA Earthdata system. Please visit https://urs.earthdata.nasa.gov to register and manage your Earthdata Login account. This account is free to create and only takes a moment to set up.\n#\n# At this point in time (as we are still transitioning to a cloud environment), in order to access data from the Earthdata Cloud, you need special, early access, persmissions. For the workshop today, you have already been added to the list and the Earthdata login you provided prior to the workshop has been granted this access.\n#\n# The `setup_earthdata_login_auth` function will allow Python scripts to log into any Earthdata Login application programmatically. To avoid being prompted for\n# credentials every time you run and also allow clients such as curl to log in, you can add the following\n# to a `.netrc` (`_netrc` on Windows) file in your home directory:\n#\n# ```\n# machine urs.earthdata.nasa.gov\n# login \n# password \n# ```\n#\n# Make sure that this file is only readable by the current user or you will receive an error stating\n# \"netrc access too permissive.\"\n#\n# `$ chmod 0600 ~/.netrc`\n\n#%%\n\nTOKEN_DATA = (\"\"\n \"%s\"\n \"%s\"\n \"PODAAC CMR Client\"\n \"%s\"\n \"\")\n\n\ndef setup_cmr_token_auth(endpoint: str = 'cmr.earthdata.nasa.gov'):\n ip = requests.get(\"https://ipinfo.io/ip\").text.strip()\n return requests.post(\n url=\"https://%s/legacy-services/rest/tokens\" % endpoint,\n data=TOKEN_DATA % (input(\"Username: \"), getpass(\"Password: \"), ip),\n headers={'Content-Type': 'application/xml',\n 'Accept': 'application/json'}\n ).json()['token']['id']\n\n\ndef setup_earthdata_login_auth(endpoint: str = 'urs.earthdata.nasa.gov'):\n netrc_name = \"_netrc\" if system() == \"Windows\" else \".netrc\"\n try:\n username, _, password = netrc(\n file=join(expanduser('~'), netrc_name)).authenticators(endpoint)\n except (FileNotFoundError, TypeError):\n print('Please provide your Earthdata Login credentials for access.')\n print('Your info will only be passed to %s and will not be exposed in Jupyter.' 
% (\n endpoint))\n username = input('Username: ')\n password = getpass('Password: ')\n manager = request.HTTPPasswordMgrWithDefaultRealm()\n manager.add_password(None, endpoint, username, password)\n auth = request.HTTPBasicAuthHandler(manager)\n jar = CookieJar()\n processor = request.HTTPCookieProcessor(jar)\n opener = request.build_opener(auth, processor)\n request.install_opener(opener)\n\n\n# Get your authentication token for searching restricted records in the CMR:\n_token = setup_cmr_token_auth(endpoint=\"cmr.earthdata.nasa.gov\")\n\n# Start authenticated session with URS to allow restricted data downloads:\nsetup_earthdata_login_auth(endpoint=\"urs.earthdata.nasa.gov\")\n\n\n# ___\n# ## __Data Search and Discovery__\n#\n# Background on the two datasets we're working with in this tutorial:\n#\n# [**JPL GRACE and GRACE-FO Mascon Ocean, Ice, and Hydrology Equivalent Water Height Coastal Resolution Improvement (CRI) Filtered Release 06 Version 02**](https://podaac.jpl.nasa.gov/dataset/TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2)\n#\n# Provides land water equivalent (LWE) thickness for observing seasonal changes in water storage around the river. When discharge is high, the change in water storage will increase, pointing to a wet season. This product provides gridded monthly global water storage/height anomalies in a single data file in netCDF format, and can be used for analysis for ocean, ice, and hydrology phenomena. Source data are from [GRACE](https://podaac.jpl.nasa.gov/GRACE) and [GRACE-FO](https://podaac.jpl.nasa.gov/GRACE-FO)\n#\n# [**RSS SMAP Level 3 Sea Surface Salinity Standard Mapped Image 8-Day Running Mean V4.0 Validated Dataset**](https://podaac.jpl.nasa.gov/SMAP?sections=data)\n#\n# Daily data files (NetCDF format) for this product are based on Sea Surface Salinity averages spanning an 8-day moving time window, gridded at 0.25 degree x 0.25 degree.\n\n# #### First, define the region of interest over the Amazon river basin and set the temporal range for the year 2019:\n\n#%%\n\n# Bounding Box spatial parameter in decimal degree 'W,S,E,N' format.\nbounding_box = '-52,-2,-43,6'\n\n# Each date in yyyy-MM-ddTHH:mm:ssZ format; date range in start,end format\ntemporal = '2019-01-01T00:00:00Z,2019-12-31T23:59:59Z'\n\n\n# #### Set up a nested dictionary with the two data products of interest\n#\n# Before we search programmatically using the [Common Metadata Repository (CMR)](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html), we can use NASA Earthdata Search to visualize file coverage over multiple data sets and to access the same data you will be working with below:\n# [Earthdata Search project](https://search.earthdata.nasa.gov/projects?p=!C1938032626-POCLOUD!C1940468263-POCLOUD!C1650311642-PODAAC!C1664148252-PODAAC&pg[1][v]=t&pg[1][gsk]=-start_date&pg[2][v]=t&pg[2][gsk]=-start_date&pg[3][v]=t&pg[4][v]=t&q=TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2&sb[0]=-52%2C-2%2C-43%2C6&m=2.00390625!-47.50048828125!6!1!0!0%2C2&qt=2019-04-01T00%3A00%3A00.000Z%2C2019-04-30T23%3A59%3A59.999Z&tl=1591244551!4!!)\n\n#%%\n\nsearch_parameters = {\n 'grace': {\n 'short_name': 'TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2',\n 'provider': 'POCLOUD',\n 'bounding_box': bounding_box,\n 'temporal': temporal,\n 'token': _token},\n 'smap': {\n 'short_name': 'SMAP_RSS_L3_SSS_SMI_8DAY-RUNNINGMEAN_V4',\n 'provider': 'POCLOUD',\n 'bounding_box': bounding_box,\n 'temporal': temporal,\n 'token': _token},\n}\n\n\n# #### Discover file number and file size\n#\n# Using CMR search, determine the number of files 
that exist over this time and area of interest, as well as the average size and total volume of those files.\n\n#%%\n\nsearch_url = \"https://cmr.earthdata.nasa.gov/search/granules\"\noutput_format = \"json\"\n\n# fn.search_granules(search_parameters[k], _token)\nfor k, v in search_parameters.items():\n parameters = {\n \"scroll\": \"true\",\n \"page_size\": 100,\n }\n\n response = requests.post(\n f\"{search_url}.{output_format}\", params=parameters, data=v)\n response.raise_for_status()\n\n hits = int(response.headers['CMR-Hits'])\n if hits > 0:\n print(f\"Found {hits} files\")\n results = json.loads(response.content)\n granules = []\n granules.extend(results['feed']['entry'])\n granule_sizes = [float(granule['granule_size'])\n for granule in granules]\n print(f\"The total size of all files is {sum(granule_sizes):.2f} MB\")\n else:\n print(\"Found no hits\")\n\n\n# #### Locate the data access URLs provided by the data product (collection) metadata:\n#\n# GRACE data:\n\n#%%\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/granules.umm_json\",\n params=search_parameters['grace'])\ngrace_gran = r.json()\nprint(\"files returned:\", grace_gran['hits'])\ngrace_gran['items'][0]['umm']['RelatedUrls']\n\n\n# and SMAP data:\n\n#%%\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/granules.umm_json\",\n params=search_parameters['smap'])\n\nsmap_gran = r.json()\nprint(\"files returned:\", smap_gran['hits'])\nsmap_gran['items'][0]['umm']['RelatedUrls']\n\n\n# ___\n# ## __Data Access from NASA Earthdata Harmony__\n#\n# As you have seen so far in this workshop, there are several different methodologies to search and access data. The URLs above can be located with the data granule (file) metadata, and pulled into a list that can then be bulk downloaded as you've seen in the previous tutorials.\n#\n# [Harmony](https://harmony.earthdata.nasa.gov/) is a growing effort across NASA EOSDIS community to provide a consistent method to access and perform data subsetting and transformation services for data in the Earthdata Cloud. These services are processed in the cloud, with data archived in the cloud, and outputs can be accessed by downloading to a local machine or through direct in-region access via Amazon Web Services. These next steps walk through this access method to access the data outputs directly within the AWS `us-west-2` region so that we can read the data into memory within this AWS hub environment.\n\n# #### __Get collection ID for Harmony__\n#\n# Harmony operates on an input Collection concept-id (a CMR construct). Here's how you identify the CMR concept-id for the data products of interest:\n\n#%%\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/collections.umm_json\",\n params=search_parameters['grace'])\n\ngrace_coll = r.json()\ngrace_coll['hits']\ngrace_coll_meta = grace_coll['items'][0]['meta']\ngrace_coll_id = grace_coll_meta['concept-id']\nprint('GRACE collection id:', grace_coll_id)\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/collections.umm_json\",\n params=search_parameters['smap'])\n\nsmap_coll = r.json()\nsmap_coll['hits']\n\nsmap_coll_meta = smap_coll['items'][0]['meta']\nsmap_coll_id = smap_coll_meta['concept-id']\nprint('SMAP collection id:', smap_coll_id)\n\n\n# #### __Request SMAP data reformatted to Zarr__:\n#\n# [Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. 
Zarr also enables you to store arrays in memory, on disk, inside a Zip file, or on S3. Harmony's Zarr Reformatter service will transform the SMAP data from its native NetCDF format to Zarr, to allow us to open and download/read just the data that we require for our Amazon Basin study area.\n#\n#\n# For our science use case, ideally we'd also want to request an entire year but since this can take a long time to process with many concurrent requests during this live workshop, we will just request a month of data for demonstration purposes.\n\n#%%\n\nharmony_root = 'https://harmony.earthdata.nasa.gov'\nharmony_params = {\n 'collection_id': smap_coll_id,\n 'ogc-api-coverages_version': '1.0.0',\n 'variable': 'all',\n 'lat': '(-2:6)',\n 'lon': '(-52:-43)',\n 'start': '2019-04-01T00:00:00Z',\n 'stop': '2019-04-30T23:59:59Z',\n 'format': 'application/x-zarr',\n}\n\nsmap_url = harmony_root + \\\n '/{collection_id}/ogc-api-coverages/{ogc-api-coverages_version}/collections/{variable}/coverage/rangeset?format={format}&subset=lat{lat}&subset=lon{lon}&subset=time(\"{start}\":\"{stop}\")'.format(\n **harmony_params)\nprint(smap_url)\n\n#%%\n\nsmap_response = request.urlopen(smap_url)\nsmap_results = smap_response.read()\nsmap_json = json.loads(smap_results)\nprint(json.dumps(smap_json, indent=2))\nsmap_jobId = smap_json['jobID']\n\n#%%\n\nsmap_job_url = f'https://harmony.earthdata.nasa.gov/jobs/{smap_jobId}'\n\nwhile True:\n loop_response = request.urlopen(smap_job_url)\n loop_results = loop_response.read()\n job_json = json.loads(loop_results)\n if job_json['status'] != 'running':\n break\n print(\n f\"# Job status is running. Progress is {job_json['progress']} %. Trying again.\")\n time.sleep(10)\n\nsmap_links = []\nif job_json['status'] == 'successful' and job_json['progress'] == 100:\n print(\"# Job progress is 100%. Links to job outputs are displayed below:\")\n smap_links = [link['href'] for link in job_json['links']]\n display(smap_links)\n\n#%%\n\nsmap_zarr_urls = smap_links[5:]\nsmap_zarr_urls\n\n\n# **Access credentials for the output zarr file**\n#\n# Credentials provided in the Harmony job response provide authenticated access to your staged S3 resources.\n#\n# Grab the credentials as a JSON string, load to a Python dictionary, and display their expiration date:\n\n#%%\n\nwith request.urlopen(f\"https://harmony.earthdata.nasa.gov/cloud-access\") as f:\n creds = json.loads(f.read())\n\ncreds['Expiration']\n\n\n# ### Open staged zarr files with *s3fs*\n#\n# We use the AWS `s3fs` package to get metadata about the zarr data store and read in the credentials we pulled from our Harmony job response:\n\n#%%\n\nzarr_fs = s3fs.S3FileSystem(\n key=creds['AccessKeyId'],\n secret=creds['SecretAccessKey'],\n token=creds['SessionToken'],\n client_kwargs={'region_name': 'us-west-2'},\n)\n\n\n# #### SMAP loaded into xarray\n#\n# `xarray` is a python package designed to work with multi-dimensional arrays. See the [xarray website](http://xarray.pydata.org/en/stable/) for more information. 
This next code block will take all of the month's worth of SMAP data we pulled above from Harmony into xarray in a single command.\n\n#%%\n\nsmap_zarr_urls = smap_links[5:]\nsmap_zarr_stores = [zarr_fs.get_mapper(\n root=u, check=False) for u in smap_zarr_urls]\nds_SMAP = xr.open_mfdataset(smap_zarr_stores, engine=\"zarr\")\n\nprint(ds_SMAP)\n\n\n# ____\n# ### __We'll come back to our SMAP data now that it's loaded into xarray.__\n#\n# **We'll now move into breakout rooms to work through the same data acces steps as above with the GRACE data.**\n#\n# As you run through each of the code blocks, worth within your groups to ask questions and discuss the workflow. Instructors will move in and out of breakout rooms to offer assistance.\n\n# #### __Request GRACE data reformatted to Zarr__:\n\n#%%\n\nharmony_root = 'https://harmony.earthdata.nasa.gov'\nharmony_params = {\n 'collection_id': grace_coll_id,\n 'ogc-api-coverages_version': '1.0.0',\n 'variable': 'all',\n 'lat': '(-2:6)',\n 'lon': '(-52:-43)',\n 'start': '2019-01-01T00:00:00Z',\n 'stop': '2019-12-31T23:59:59Z',\n 'format': 'application/x-zarr',\n}\n\ngrace_url = harmony_root + \\\n '/{collection_id}/ogc-api-coverages/{ogc-api-coverages_version}/collections/{variable}/coverage/rangeset?format={format}&subset=lat{lat}&subset=lon{lon}&subset=time(\"{start}\":\"{stop}\")'.format(\n **harmony_params)\nprint(grace_url)\n\n#%%\n\ngrace_response = request.urlopen(grace_url)\ngrace_results = grace_response.read()\ngrace_json = json.loads(grace_results)\nprint(json.dumps(grace_json, indent=2))\ngrace_jobId = grace_json['jobID']\n\n#%%\n\ngrace_job_url = f'https://harmony.earthdata.nasa.gov/jobs/{grace_jobId}'\n\nwhile True:\n loop_response = request.urlopen(grace_job_url)\n loop_results = loop_response.read()\n job_json = json.loads(loop_results)\n if job_json['status'] != 'running':\n break\n print(\n f\"# Job status is running. Progress is {job_json['progress']} %. Trying again.\")\n time.sleep(10)\n\ngrace_links = []\nif job_json['status'] == 'successful' and job_json['progress'] == 100:\n print(\"# Job progress is 100%. Links to job outputs are displayed below:\")\n grace_links = [link['href'] for link in job_json['links']]\n display(grace_links)\n\n\n# #### __Access urls for the output zarr files__\n#\n# The new zarr dataset is staged for us in an S3 bucket. The url is the last one in the list shown above.\n#\n# Select the url and display below:\n\n#%%\n\ngrace_zarr_urls = grace_links[-1]\ngrace_zarr_urls\n\n#%%\n\ngrace_zarr_store = zarr_fs.get_mapper(root=grace_zarr_urls, check=False)\ngrace_zarr_dataset = zarr.open(grace_zarr_store)\n\nprint(grace_zarr_dataset.tree())\n\n#%%\n\nprint(grace_zarr_dataset.lwe_thickness.info)\n\n\n# #### __Open staged zarr file with *xarray*__\n#\n# Read more about `xarray`'s zarr reader here: http://xarray.pydata.org/en/stable/generated/xarray.open_zarr.html\n#\n# This xarray method allows you to pull in the Zarr outputs from Harmony directly into an xarray dataset.\n#\n# Open the zarr dataset and print the dataset \"header\":\n\n#%%\n\nds_GRACE = xr.open_zarr(grace_zarr_store)\nprint(ds_GRACE)\n\n\n# **Subset by Latitude/Longitude**\n#\n# Once we have obtained all the data, to make processing quicker, we are going to subset datasets by latitude/longitude for the Amazon River estuary.\n#\n# Once we have obtained the GRACE-FO data, we should spatial subset the data to the minimal area covering the Amazon River estuary. 
This will reduce processing load and reduce cloud costs for the user.\n#\n# Make a GRACE-FO subset and display the min, max of the *lat* and *lon* variables:\n\n#%%\n\nsubset_GRACE = ds_GRACE.sel(lat=slice(-18, 10), lon=slice(275, 330))\nprint(subset_GRACE.lat.min().data,\n subset_GRACE.lat.max().data,\n subset_GRACE.lon.min().data,\n subset_GRACE.lon.max().data)\n\n\n# **Select the variable for Land Water Equivalent Thickness (*lwe_thickness*)**\n#\n# Grab the land water equivalent thickness variable from the GRACE subset:\n\n#%%\n\nlwe = subset_GRACE.lwe_thickness\nprint(lwe)\n\n\n# ### __Return to the main room__\n#\n# Now that the GRACE data have also been loaded into xarray and subsetted, we'll return to the main room to begin our plotting and analysis.\n# ____\n\n# ## __Plotting and analysis__\n\n# __First we'll plot and animate the GRACE data over time, and then we'll subset and plot the SMAP data.__\n#\n# Now that we have a time/space slice of interest from the GRACE collection, let's plot the first time step, to see what we've got (or to see what it looks like. First we'll define two functions to make the plotting (and next animation step) a bit more convenient:\n\n#%%\n\ndef setup_map(ax, pmap, ds_subset, x, y, var, t, cmap, levels, extent):\n title = str(pd.to_datetime(ds_subset.time[t].values))\n pmap.set_title(title, fontsize=14)\n pmap.coastlines()\n pmap.set_extent(extent)\n pmap.add_feature(cartopy.feature.RIVERS)\n variable_desired = var[t, :, :]\n cont = pmap.contourf(x, y, variable_desired,\n cmap=cmap, levels=levels, zorder=1)\n return cont\n\n\ndef animate_ts(framenumber, ax, pmap, ds_subset, x, y, var, t, cmap, levels, extent):\n cont = setup_map(ax, pmap, ds_subset, x, y, var, t +\n framenumber, cmap, levels, extent)\n return cont\n\n#%%\n\n# Initialize a matplotlib plot object and add subplot:\nfig = plt.figure(figsize=[13, 9])\nax = fig.add_subplot(1, 1, 1)\n\n# Configure axes to display projected data using PlateCarree crs:\npmap = plt.axes(projection=ccrs.PlateCarree())\n\n# Get arrays of x and y to label the plot axes:\nx, y = np.meshgrid(subset_GRACE.lon, subset_GRACE.lat)\n\n# Set a few constants for plotting the GRACE-FO time series:\ntime_start = 168\ncmap_name = \"bwr_r\"\ncmap_levels = np.linspace(-100., 100., 14)\nmap_extent = [-85, -30, -16, 11]\n\n# Plot the first timestep:\ncont = setup_map(ax, pmap, subset_GRACE, x, y, lwe,\n time_start, cmap_name, cmap_levels, map_extent)\n\nfig.colorbar(cont, ticks=cmap_levels, orientation='horizontal',\n label='Land Water Equivalent Thickness (cm)')\n\n\n# #### Animate changes over time\n#\n# Let's now explore how land water mass changes throughout the year 2019, by creating an animation of GRACE monthly land water equivalent (LWE) maps over the Amazon River.\n#\n# Plot all the 2019 timesteps sequentially to create an animation of land water equivalent thickness for the Amazon Rainforest territories:\n\n#%%\n\nani = animation.FuncAnimation(fig, animate_ts, frames=range(0, 12), fargs=(\n ax, pmap, subset_GRACE, x, y, lwe, time_start, cmap_name, cmap_levels, map_extent\n), interval=500)\n\nHTML(ani.to_html5_video())\n\n\n# User note: You will need to install 'ffmpeg' in the cmd prompt to save the .mpg to disk. 
Use the following command to install from the conda-forge channel:\n#\n# ```shell\n# conda install -c conda-forge ffmpeg\n# ```\n#\n# Uncomment, run the next cell to save the animation to MP4:\n\n#%%\n\n#ani.save(\"earthdatacloud_animation_GRACEFO.mp4\", writer=animation.FFMpegWriter())\n\n\n# ### __SMAP Data Analysis__\n#\n# Normally, we'd pull the entire 2019 data for the SMAP collections as well (recall we only requested 1 month of data earlier in the notebook, due to current system constraints), average the daily SMAP data to monthly means, and compare with the monthly GRACE in a 2019 monthly time series.\n#\n# For now, let's create the building blocks for that workflow, by creating the 1 monthly mean for SMAP, plotting a spatial map of the SMAP SSS monthly mean to take a quick look at that data, and setting up the 2019 monthly time series plotting (without completing here).\n\n# Frist, we can subset the SMAP data as we did above with GRACE. Note the coordinate syntax differs slightly as noted below.\n\n#%%\n\n# SMAP\n# switched lat directions from GRACE, and longitude has positives and negatives\nlat_bnds, lon_bnds = [-2, 6], [180-52, 180-43]\nds_SMAP_subset = ds_SMAP.sel(lat=slice(*lat_bnds), lon=slice(*lon_bnds))\nprint(ds_SMAP_subset.lat.min().data,\n ds_SMAP_subset.lat.max().data,\n ds_SMAP_subset.lon.min().data,\n ds_SMAP_subset.lon.max().data,)\nds_SMAP_subset\n\n\n# **Important**: remember to use `compute()` so that the `scale_factor` is applied!\n#\n# We'll also replace fill values with numpy.nan for our next monthly mean computation step.\n\n#%%\n\n# Scale the values to their valid range (compute):\nsss_data = ds_SMAP_subset.sss_smap.data.compute()\n\n# Replace fills with numpy nan:\nsss_data[sss_data == -9999.] = np.nan\n\n# Replace the data array for the sss monthly mean variable:\nds_SMAP_subset.sss_smap.data = sss_data\n\nprint(ds_SMAP_subset.sss_smap)\n\n\n# Now compute monthly means using the convenient xarray `groupby` method and view the minimum and maximum monthly mean values.\n\n#%%\n\nds_SMAP_subset_sss_momean = ds_SMAP_subset.sss_smap.groupby(\n \"time.month\").mean(skipna=True)\nds_SMAP_subset_sss_momean.min(\n skipna=True), ds_SMAP_subset_sss_momean.max(skipna=True)\n\n\n# #### __Plot the April mean SMAP SSS__\n#\n# Creating a basic plot is easy using the built-in plotting capabilities of xarray:\n\n#%%\n\n# plot SMAP subset\nds_SMAP_subset_sss_momean.sel(month=4).plot()\nplt.show()\n\n#%%\n\nsss_min, sss_max = [ds_SMAP_subset.sss_smap.min(skipna=True).compute(\n).data.item(), ds_SMAP_subset.sss_smap.max(skipna=True).compute().data.item()]\nsss_min, sss_max\n\n#%%\n\n# A new figure window\nfig = plt.figure(figsize=[10, 8])\nax = fig.add_subplot(1, 1, 1) # specify (nrows, ncols, axnum)\npmap = plt.axes(projection=ccrs.PlateCarree())\n\n# Necessary Variables for functions\nextent = [180-52, 180-43, -2, 6] # lat/lon extents of map\n# x, y lat/lon values for functions\nx, y = np.meshgrid(ds_SMAP_subset.lon, ds_SMAP_subset.lat)\n# number of levels for color differentiation\nlevels = np.linspace(sss_min, sss_max, 10)\ncmap = 'viridis' # color scheme\nt = 1 # time to start with\n# variable we will be subsetting from the GRACE-FO data\nvar = ds_SMAP_subset.sss_smap.compute()\n# Time of specific time step\ntitle = str(pd.to_datetime(ds_SMAP_subset.time[t].values))\n\n# Set up first time step\ncont = setup_map(ax, pmap, ds_SMAP_subset, x, y, var, t, cmap, levels, extent)\n\n# Make a color bar\nfig.colorbar(cont, cmap=cmap, boundaries=levels, ticks=levels,\n 
orientation='horizontal', label='Sea Surface Salinity (psu)')\n\n\n# #### Animate April mean SMAP SSS\n#\n# Like we did with GRACE, we can animate the changes for the single month of SMAP data to see how salinty changes over time.\n\n#%%\n\n# Create animation for April 2019 (change the frame range for different time periods)\nani = animation.FuncAnimation(fig, animate_ts, frames=range(0, ds_SMAP_subset.time.size-1), fargs=(\n ax, pmap, ds_SMAP_subset, x, y, var, t, cmap, levels, extent\n), interval=400)\n\nHTML(ani.to_html5_video())\n\n\n# ## __Tutorial Summary and Additional Resources__\n#\n# The building blocks are now in place to do a longer time series analysis across GRACE and SMAP data, to better understand the relationship between river discharge and sea surface salinity for impact assessment.\n#\n# To conclude, we've searched programmatically for data archived in the PO.DAAC Earthdata Cloud over a region and time period of interest, requested the data from the Harmony API, read the data directly into `xarray` from the staged s3 location within AWS `us-west-2` without having to pull the data down into local storage, and performed subsetting and plotting in preparation for a time series analysis.\n#\n# There are two more sections of resources below:\n#\n# 1. __Adding river height (pre-SWOT) data to our our analysis__\n# - Pre-SWOT data are available through the PO DAAC on-premise location, so you can continue your analysis with data both in and outside of the cloud using OPeNDAP.\n#\n# 2. __How to set up a Jupyter Notebook running in your own EC2 instance__\n# - This same workflow can be achieved outside of the Pangeo BinderHub within an EC2 instance running your personal AWS account. More information and instructions are available below.\n\n# ___\n#\n# #### __On-prem hydro data from Pre-SWOT MEaSUREs program__\n#\n# Data from [**PRESWOT_HYDRO_GRRATS_L2_DAILY_VIRTUAL_STATION_HEIGHTS_V2**](https://podaac.jpl.nasa.gov/dataset/PRESWOT_HYDRO_GRRATS_L2_DAILY_VIRTUAL_STATION_HEIGHTS_V2) are not currently available on the cloud, but we can access via the PO.DAAC's on-prem OPeNDAP service (Hyrax) instead.\n#\n# \n#\n# The guidebook explains the details of the Pre-SWOT MEaSUREs data: https://podaac-tools.jpl.nasa.gov/drive/files/allData/preswot_hydrology/L2/rivers/docs/GRRATS_user_handbookV2.pdf\n#\n# **Access URL for PO.DAAC on-prem OPeNDAP service**\n#\n# Identify an appropriate OPeNDAP endpoint through the following steps:\n#\n# 1. Go to the project/mission page on the PO.DAAC portal (e.g. for Pre-SWOT MEaSUREs: https://podaac.jpl.nasa.gov/MEaSUREs-Pre-SWOT)\n#\n# 2. Choose the dataset of interest. Go to the \"Data Access\" tab of the corresponding dataset landing page, which should like the OPeNDAP access link (for compatible datasets, e.g. for the daily river heights from virtual stations: https://podaac-opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/).\n#\n# 3. Navigate to the desired NetCDF file and copy the endpoint (e.g. 
for our Amazon Basin use case we choose the South America file: https://opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/South_America_Amazon1kmdaily.nc).\n#\n# ### Open netCDF file with *xarray*\n#\n# Open the netCDF dataset via OPeNDAP using *xarray*:\n\n#%%\n\nds_MEaSUREs = xr.open_dataset(\n 'https://opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/South_America_Amazon1kmdaily.nc')\nprint(ds_MEaSUREs)\n\n\n# Our desired variable is height (meters above EGM2008 geoid) for this exercise, which can be subset by distance and time. Distance represents the distance from the river mouth, in this example, the Amazon estuary. Time is between April 8, 1993 and April 20, 2019.\n#\n# ### Plot\n#\n# **Amazon River heights for March 16, 2018**\n#\n# Plot the river distances and associated heights on the map at time t=9069:\n\n#%%\n\nfig = plt.figure(figsize=[13, 9])\nax = plt.axes(projection=ccrs.PlateCarree())\nax.coastlines()\nax.set_extent([-85, -30, -20, 20])\nax.add_feature(cartopy.feature.RIVERS)\n\nplt.scatter(ds_MEaSUREs.lon, ds_MEaSUREs.lat,\n lw=1, c=ds_MEaSUREs.height[:, 9069])\nplt.colorbar(label='Interpolated River Heights (m)')\nplt.clim(-10, 100)\n\nplt.show()\n\n\n# For GRACE-FO, plotting lwe_thickness[107:179,34,69] indicates time, latitude, and longitude indices corresponding to the pixel for the time period 1/2019 to 12/2019 at lat/lon (-0.7, -50). For the 2019 year, measurements of LWE thickness followd expected patterns of high volume of water from the river output into the estuary.\n#\n# **2011-2019 Seasonality Plots (WIP)**\n#\n# For GRACE-FO, plotting lwe_thickness[107:179,34,69] indicates time, latitude, and longitude indices corresponding to the pixel for the time period 8/2011 to 12/2019 at lat/lon (-0.7, -50).\n\n#%%\n\n# plot variables associated with river\nfig, ax1 = plt.subplots(figsize=[12, 7])\n# plot river height\nds_MEaSUREs.height[16, 6689:9469].plot(color='darkblue')", "target_code": "ax2 = ax1.twinx()\nax2.plot(subset_GRACE.time[107:179],\n subset_GRACE.lwe_thickness[107:179, 34, 69], color='darkorange')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#\n# \n#\n# # __Amazon river freshwater discharge impacts on coastal waters:__\n#\n# ## __Hands-on tutorial of AWS in-region access of NASA Earthdata products__\n#\n#\n# This notebook provides a basic end-to-end workflow to interact with data \"in-place\" from the NASA Earthdata Cloud, by accessing AWS S3 locations provided by [NASA Harmony](http://harmony.earthdata.nasa.gov/) outputs without the need to download data. While these outputs can be downloaded locally, the cloud offers the ability to scale compute resources to perform analyses over large areas and time spans, which is critical as data volumes continue to grow.\n#\n# This workflow combines search, discovery, access, reformatting, basic analyses, and plotting components presented during Part-II. 
Though the example we're working with in this notebook only focuses on a small time and area to account for a large number of concurrent processing requests, this workflow can be modified and scaled up to suit a larger time range and region of interest.\n#\n# #### Learning objectives:\n#\n# - Understand the Pangeo BinderHub environment used during the workshop and how to execute code within a Jupyter Notebook\n# - Search for Liquid Water Equivalent (LWE) data from GRACE/GRACE-FO and Sea Surface Salinity (SSS) from SMAP\n# - Execute programmatic data access queries, plotting, and direct in-region cloud access using open source Python libraries.\n# - Access data in Zarr format from Earthdata Cloud (AWS)\n# - Subset both, plot and compare coincident data.\n# - Identify resources, including the Earthdata Cloud Primer, for getting started with Amazon Web Services outside of the Workshop to access and work with data with a cloud environment.\n#\n\n# ___\n#\n#

\n# \n# \n#\n#

\n#\n#\n#\n#\n# ### __Pangeo BinderHub and Project Jupyter__\n#\n# First, some basics on the Pangeo compute environment used during the live workshop and how to interact with Jupyter Notebooks and the Jupyter Lab interface.\n#\n# * [Pangeo BinderHub](https://binder.pangeo.io/): A multi-user server for interactive data analysis. This Hub is running in the AWS `us-west-2` region, which is where all Earthdata Cloud data and transformation service outputs are located. Pangeo is supported, in part, by the National Science Foundation (NSF) and the National Aeronautics and Space Administration (NASA). Google provided compute credits on Google Compute Engine. The Pangeo community promotes open, reproducible, and scalable science. We thank you for supporting this AGU Workshop.\n#\n# **This Hub is only supported during the live AGU workshop**. See instructions at the bottom of this notebook for how to set up your own AWS EC2 instance so that you can perform the same cloud access within your personal AWS environment.\n#\n# * [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/latest/): Interactive, reproducible, open source, and exploratory browser integrated computing environment.\n# * [JupyterLab](https://github.com/jupyterlab/jupyterlab): Web-based integrated IDE for computational workflows.\n\n# ### __Jupyter Notebook Basics__\n#\n# The body of a notebook is composed of cells. Each cell contains either markdown, code input, code output, or raw text. Cells can be included in any order and edited and executed at-will.\n#\n# **Markdown cells** - These are used to build a nicely formatted narrative around the code in the document.\n#\n# **Code cells** - These are used to define the computational code in the document. They come in two forms: the input cell where the user types the code to be executed, and the output cell which is the representation of the executed code.\n#\n# **Raw cells** - These are used when text needs to be included in raw form, without execution or transformation.\n#\n# #### Execute a cell or selected cells by pressing shift + enter\n#\n\n\nimport json\nfrom IPython.display import HTML\nimport cartopy\nimport cartopy.crs as ccrs\nimport matplotlib.animation as animation\nimport xarray as xr\nimport pandas as pd\nimport numpy as np\nimport requests\nimport time\nimport dask.array as da\nimport matplotlib.pyplot as plt\nimport zarr\nimport rasterio\nimport s3fs\nimport intake\nfrom pprint import pprint\nfrom os.path import join, expanduser\nfrom http.cookiejar import CookieJar\nfrom urllib import request\nfrom getpass import getpass\nfrom platform import system\nfrom netrc import netrc\nprint('Hello World!')\n\n\n# #### Collapse a cell or cell output by clicking on the blue line to the left of the cell\n#\n# The cell content is replaced by three dots, indicating that the cell is collapsed.\n#\n# #### Execute multiple cells or run the entire notebook\n# Select cells with **shift + Up** or **shift + Down** and then execute selection with **shift + enter**.\n#\n# #### Run the whole notebook in a single step by clicking on the menu Run -> Run All Cells.\n#\n# See https://jupyter.readthedocs.io/en/latest/running.html for more guidance on running notebooks.\n\n# ___\n# ### __Import modules__\n#\n# The Python ecosystem is organized into modules. A module must be imported before the contents of that modules can be used. It is good practice to import modules in the first code cell of a notebook or at the top of your script. 
Not only does this make it clear which modules are being used, but it also ensures that the code fails at the beginning because one of the modules is not installed rather half way through after crunching a load of data.\n#\n# For some modules, it is common practice to shorten the module names according to accepted conventions. For example, the plotting module `matplotlib.pyplot` is shortened to `plt`. It is best to stick to these conventions rather than making up your own short names so that people reading your code see immediately what you are doing.\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## __Earthdata Login__\n#\n# An Earthdata Login account is required to access data, as well as discover restricted data, from the NASA Earthdata system. Please visit https://urs.earthdata.nasa.gov to register and manage your Earthdata Login account. This account is free to create and only takes a moment to set up.\n#\n# At this point in time (as we are still transitioning to a cloud environment), in order to access data from the Earthdata Cloud, you need special, early access, persmissions. For the workshop today, you have already been added to the list and the Earthdata login you provided prior to the workshop has been granted this access.\n#\n# The `setup_earthdata_login_auth` function will allow Python scripts to log into any Earthdata Login application programmatically. To avoid being prompted for\n# credentials every time you run and also allow clients such as curl to log in, you can add the following\n# to a `.netrc` (`_netrc` on Windows) file in your home directory:\n#\n# ```\n# machine urs.earthdata.nasa.gov\n# login \n# password \n# ```\n#\n# Make sure that this file is only readable by the current user or you will receive an error stating\n# \"netrc access too permissive.\"\n#\n# `$ chmod 0600 ~/.netrc`\n\n\nTOKEN_DATA = (\"\"\n \"%s\"\n \"%s\"\n \"PODAAC CMR Client\"\n \"%s\"\n \"\")\n\n\ndef setup_cmr_token_auth(endpoint: str = 'cmr.earthdata.nasa.gov'):\n ip = requests.get(\"https://ipinfo.io/ip\").text.strip()\n return requests.post(\n url=\"https://%s/legacy-services/rest/tokens\" % endpoint,\n data=TOKEN_DATA % (input(\"Username: \"), getpass(\"Password: \"), ip),\n headers={'Content-Type': 'application/xml',\n 'Accept': 'application/json'}\n ).json()['token']['id']\n\n\ndef setup_earthdata_login_auth(endpoint: str = 'urs.earthdata.nasa.gov'):\n netrc_name = \"_netrc\" if system() == \"Windows\" else \".netrc\"\n try:\n username, _, password = netrc(\n file=join(expanduser('~'), netrc_name)).authenticators(endpoint)\n except (FileNotFoundError, TypeError):\n print('Please provide your Earthdata Login credentials for access.')\n print('Your info will only be passed to %s and will not be exposed in Jupyter.' 
% (\n endpoint))\n username = input('Username: ')\n password = getpass('Password: ')\n manager = request.HTTPPasswordMgrWithDefaultRealm()\n manager.add_password(None, endpoint, username, password)\n auth = request.HTTPBasicAuthHandler(manager)\n jar = CookieJar()\n processor = request.HTTPCookieProcessor(jar)\n opener = request.build_opener(auth, processor)\n request.install_opener(opener)\n\n\n# Get your authentication token for searching restricted records in the CMR:\n_token = setup_cmr_token_auth(endpoint=\"cmr.earthdata.nasa.gov\")\n\n# Start authenticated session with URS to allow restricted data downloads:\nsetup_earthdata_login_auth(endpoint=\"urs.earthdata.nasa.gov\")\n\n\n# ___\n# ## __Data Search and Discovery__\n#\n# Background on the two datasets we're working with in this tutorial:\n#\n# [**JPL GRACE and GRACE-FO Mascon Ocean, Ice, and Hydrology Equivalent Water Height Coastal Resolution Improvement (CRI) Filtered Release 06 Version 02**](https://podaac.jpl.nasa.gov/dataset/TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2)\n#\n# Provides land water equivalent (LWE) thickness for observing seasonal changes in water storage around the river. When discharge is high, the change in water storage will increase, pointing to a wet season. This product provides gridded monthly global water storage/height anomalies in a single data file in netCDF format, and can be used for analysis for ocean, ice, and hydrology phenomena. Source data are from [GRACE](https://podaac.jpl.nasa.gov/GRACE) and [GRACE-FO](https://podaac.jpl.nasa.gov/GRACE-FO)\n#\n# [**RSS SMAP Level 3 Sea Surface Salinity Standard Mapped Image 8-Day Running Mean V4.0 Validated Dataset**](https://podaac.jpl.nasa.gov/SMAP?sections=data)\n#\n# Daily data files (NetCDF format) for this product are based on Sea Surface Salinity averages spanning an 8-day moving time window, gridded at 0.25 degree x 0.25 degree.\n\n# #### First, define the region of interest over the Amazon river basin and set the temporal range for the year 2019:\n\n\n# Bounding Box spatial parameter in decimal degree 'W,S,E,N' format.\nbounding_box = '-52,-2,-43,6'\n\n# Each date in yyyy-MM-ddTHH:mm:ssZ format; date range in start,end format\ntemporal = '2019-01-01T00:00:00Z,2019-12-31T23:59:59Z'\n\n\n# #### Set up a nested dictionary with the two data products of interest\n#\n# Before we search programmatically using the [Common Metadata Repository (CMR)](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html), we can use NASA Earthdata Search to visualize file coverage over multiple data sets and to access the same data you will be working with below:\n# [Earthdata Search project](https://search.earthdata.nasa.gov/projects?p=!C1938032626-POCLOUD!C1940468263-POCLOUD!C1650311642-PODAAC!C1664148252-PODAAC&pg[1][v]=t&pg[1][gsk]=-start_date&pg[2][v]=t&pg[2][gsk]=-start_date&pg[3][v]=t&pg[4][v]=t&q=TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2&sb[0]=-52%2C-2%2C-43%2C6&m=2.00390625!-47.50048828125!6!1!0!0%2C2&qt=2019-04-01T00%3A00%3A00.000Z%2C2019-04-30T23%3A59%3A59.999Z&tl=1591244551!4!!)\n\n\nsearch_parameters = {\n 'grace': {\n 'short_name': 'TELLUS_GRAC-GRFO_MASCON_CRI_GRID_RL06_V2',\n 'provider': 'POCLOUD',\n 'bounding_box': bounding_box,\n 'temporal': temporal,\n 'token': _token},\n 'smap': {\n 'short_name': 'SMAP_RSS_L3_SSS_SMI_8DAY-RUNNINGMEAN_V4',\n 'provider': 'POCLOUD',\n 'bounding_box': bounding_box,\n 'temporal': temporal,\n 'token': _token},\n}\n\n\n# #### Discover file number and file size\n#\n# Using CMR search, determine the number of files that exist 
over this time and area of interest, as well as the average size and total volume of those files.\n\n\nsearch_url = \"https://cmr.earthdata.nasa.gov/search/granules\"\noutput_format = \"json\"\n\n# fn.search_granules(search_parameters[k], _token)\nfor k, v in search_parameters.items():\n parameters = {\n \"scroll\": \"true\",\n \"page_size\": 100,\n }\n\n response = requests.post(\n f\"{search_url}.{output_format}\", params=parameters, data=v)\n response.raise_for_status()\n\n hits = int(response.headers['CMR-Hits'])\n if hits > 0:\n print(f\"Found {hits} files\")\n results = json.loads(response.content)\n granules = []\n granules.extend(results['feed']['entry'])\n granule_sizes = [float(granule['granule_size'])\n for granule in granules]\n print(f\"The total size of all files is {sum(granule_sizes):.2f} MB\")\n else:\n print(\"Found no hits\")\n\n\n# #### Locate the data access URLs provided by the data product (collection) metadata:\n#\n# GRACE data:\n\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/granules.umm_json\",\n params=search_parameters['grace'])\ngrace_gran = r.json()\nprint(\"files returned:\", grace_gran['hits'])\ngrace_gran['items'][0]['umm']['RelatedUrls']\n\n\n# and SMAP data:\n\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/granules.umm_json\",\n params=search_parameters['smap'])\n\nsmap_gran = r.json()\nprint(\"files returned:\", smap_gran['hits'])\nsmap_gran['items'][0]['umm']['RelatedUrls']\n\n\n# ___\n# ## __Data Access from NASA Earthdata Harmony__\n#\n# As you have seen so far in this workshop, there are several different methodologies to search and access data. The URLs above can be located with the data granule (file) metadata, and pulled into a list that can then be bulk downloaded as you've seen in the previous tutorials.\n#\n# [Harmony](https://harmony.earthdata.nasa.gov/) is a growing effort across NASA EOSDIS community to provide a consistent method to access and perform data subsetting and transformation services for data in the Earthdata Cloud. These services are processed in the cloud, with data archived in the cloud, and outputs can be accessed by downloading to a local machine or through direct in-region access via Amazon Web Services. These next steps walk through this access method to access the data outputs directly within the AWS `us-west-2` region so that we can read the data into memory within this AWS hub environment.\n\n# #### __Get collection ID for Harmony__\n#\n# Harmony operates on an input Collection concept-id (a CMR construct). Here's how you identify the CMR concept-id for the data products of interest:\n\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/collections.umm_json\",\n params=search_parameters['grace'])\n\ngrace_coll = r.json()\ngrace_coll['hits']\ngrace_coll_meta = grace_coll['items'][0]['meta']\ngrace_coll_id = grace_coll_meta['concept-id']\nprint('GRACE collection id:', grace_coll_id)\n\nr = requests.get(url=\"https://cmr.earthdata.nasa.gov/search/collections.umm_json\",\n params=search_parameters['smap'])\n\nsmap_coll = r.json()\nsmap_coll['hits']\n\nsmap_coll_meta = smap_coll['items'][0]['meta']\nsmap_coll_id = smap_coll_meta['concept-id']\nprint('SMAP collection id:', smap_coll_id)\n\n\n# #### __Request SMAP data reformatted to Zarr__:\n#\n# [Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. Zarr also enables you to store arrays in memory, on disk, inside a Zip file, or on S3. 
Harmony's Zarr Reformatter service will transform the SMAP data from its native NetCDF format to Zarr, to allow us to open and download/read just the data that we require for our Amazon Basin study area.\n#\n#\n# For our science use case, ideally we'd also want to request an entire year but since this can take a long time to process with many concurrent requests during this live workshop, we will just request a month of data for demonstration purposes.\n\n\nharmony_root = 'https://harmony.earthdata.nasa.gov'\nharmony_params = {\n 'collection_id': smap_coll_id,\n 'ogc-api-coverages_version': '1.0.0',\n 'variable': 'all',\n 'lat': '(-2:6)',\n 'lon': '(-52:-43)',\n 'start': '2019-04-01T00:00:00Z',\n 'stop': '2019-04-30T23:59:59Z',\n 'format': 'application/x-zarr',\n}\n\nsmap_url = harmony_root + \\\n '/{collection_id}/ogc-api-coverages/{ogc-api-coverages_version}/collections/{variable}/coverage/rangeset?format={format}&subset=lat{lat}&subset=lon{lon}&subset=time(\"{start}\":\"{stop}\")'.format(\n **harmony_params)\nprint(smap_url)\n\n\nsmap_response = request.urlopen(smap_url)\nsmap_results = smap_response.read()\nsmap_json = json.loads(smap_results)\nprint(json.dumps(smap_json, indent=2))\nsmap_jobId = smap_json['jobID']\n\n\nsmap_job_url = f'https://harmony.earthdata.nasa.gov/jobs/{smap_jobId}'\n\nwhile True:\n loop_response = request.urlopen(smap_job_url)\n loop_results = loop_response.read()\n job_json = json.loads(loop_results)\n if job_json['status'] != 'running':\n break\n print(\n f\"# Job status is running. Progress is {job_json['progress']} %. Trying again.\")\n time.sleep(10)\n\nsmap_links = []\nif job_json['status'] == 'successful' and job_json['progress'] == 100:\n print(\"# Job progress is 100%. Links to job outputs are displayed below:\")\n smap_links = [link['href'] for link in job_json['links']]\n display(smap_links)\n\n\nsmap_zarr_urls = smap_links[5:]\nsmap_zarr_urls\n\n\n# **Access credentials for the output zarr file**\n#\n# Credentials provided in the Harmony job response provide authenticated access to your staged S3 resources.\n#\n# Grab the credentials as a JSON string, load to a Python dictionary, and display their expiration date:\n\n\nwith request.urlopen(f\"https://harmony.earthdata.nasa.gov/cloud-access\") as f:\n creds = json.loads(f.read())\n\ncreds['Expiration']\n\n\n# ### Open staged zarr files with *s3fs*\n#\n# We use the AWS `s3fs` package to get metadata about the zarr data store and read in the credentials we pulled from our Harmony job response:\n\n\nzarr_fs = s3fs.S3FileSystem(\n key=creds['AccessKeyId'],\n secret=creds['SecretAccessKey'],\n token=creds['SessionToken'],\n client_kwargs={'region_name': 'us-west-2'},\n)\n\n\n# #### SMAP loaded into xarray\n#\n# `xarray` is a python package designed to work with multi-dimensional arrays. See the [xarray website](http://xarray.pydata.org/en/stable/) for more information. 
This next code block will take all of the month's worth of SMAP data we pulled above from Harmony into xarray in a single command.\n\n\nsmap_zarr_urls = smap_links[5:]\nsmap_zarr_stores = [zarr_fs.get_mapper(\n root=u, check=False) for u in smap_zarr_urls]\nds_SMAP = xr.open_mfdataset(smap_zarr_stores, engine=\"zarr\")\n\nprint(ds_SMAP)\n\n\n# ____\n# ### __We'll come back to our SMAP data now that it's loaded into xarray.__\n#\n# **We'll now move into breakout rooms to work through the same data acces steps as above with the GRACE data.**\n#\n# As you run through each of the code blocks, worth within your groups to ask questions and discuss the workflow. Instructors will move in and out of breakout rooms to offer assistance.\n\n# #### __Request GRACE data reformatted to Zarr__:\n\n\nharmony_root = 'https://harmony.earthdata.nasa.gov'\nharmony_params = {\n 'collection_id': grace_coll_id,\n 'ogc-api-coverages_version': '1.0.0',\n 'variable': 'all',\n 'lat': '(-2:6)',\n 'lon': '(-52:-43)',\n 'start': '2019-01-01T00:00:00Z',\n 'stop': '2019-12-31T23:59:59Z',\n 'format': 'application/x-zarr',\n}\n\ngrace_url = harmony_root + \\\n '/{collection_id}/ogc-api-coverages/{ogc-api-coverages_version}/collections/{variable}/coverage/rangeset?format={format}&subset=lat{lat}&subset=lon{lon}&subset=time(\"{start}\":\"{stop}\")'.format(\n **harmony_params)\nprint(grace_url)\n\n\ngrace_response = request.urlopen(grace_url)\ngrace_results = grace_response.read()\ngrace_json = json.loads(grace_results)\nprint(json.dumps(grace_json, indent=2))\ngrace_jobId = grace_json['jobID']\n\n\ngrace_job_url = f'https://harmony.earthdata.nasa.gov/jobs/{grace_jobId}'\n\nwhile True:\n loop_response = request.urlopen(grace_job_url)\n loop_results = loop_response.read()\n job_json = json.loads(loop_results)\n if job_json['status'] != 'running':\n break\n print(\n f\"# Job status is running. Progress is {job_json['progress']} %. Trying again.\")\n time.sleep(10)\n\ngrace_links = []\nif job_json['status'] == 'successful' and job_json['progress'] == 100:\n print(\"# Job progress is 100%. Links to job outputs are displayed below:\")\n grace_links = [link['href'] for link in job_json['links']]\n display(grace_links)\n\n\n# #### __Access urls for the output zarr files__\n#\n# The new zarr dataset is staged for us in an S3 bucket. The url is the last one in the list shown above.\n#\n# Select the url and display below:\n\n\ngrace_zarr_urls = grace_links[-1]\ngrace_zarr_urls\n\n\ngrace_zarr_store = zarr_fs.get_mapper(root=grace_zarr_urls, check=False)\ngrace_zarr_dataset = zarr.open(grace_zarr_store)\n\nprint(grace_zarr_dataset.tree())\n\n\nprint(grace_zarr_dataset.lwe_thickness.info)\n\n\n# #### __Open staged zarr file with *xarray*__\n#\n# Read more about `xarray`'s zarr reader here: http://xarray.pydata.org/en/stable/generated/xarray.open_zarr.html\n#\n# This xarray method allows you to pull in the Zarr outputs from Harmony directly into an xarray dataset.\n#\n# Open the zarr dataset and print the dataset \"header\":\n\n\nds_GRACE = xr.open_zarr(grace_zarr_store)\nprint(ds_GRACE)\n\n\n# **Subset by Latitude/Longitude**\n#\n# Once we have obtained all the data, to make processing quicker, we are going to subset datasets by latitude/longitude for the Amazon River estuary.\n#\n# Once we have obtained the GRACE-FO data, we should spatial subset the data to the minimal area covering the Amazon River estuary. 
This will reduce processing load and reduce cloud costs for the user.\n#\n# Make a GRACE-FO subset and display the min, max of the *lat* and *lon* variables:\n\n\nsubset_GRACE = ds_GRACE.sel(lat=slice(-18, 10), lon=slice(275, 330))\nprint(subset_GRACE.lat.min().data,\n subset_GRACE.lat.max().data,\n subset_GRACE.lon.min().data,\n subset_GRACE.lon.max().data)\n\n\n# **Select the variable for Land Water Equivalent Thickness (*lwe_thickness*)**\n#\n# Grab the land water equivalent thickness variable from the GRACE subset:\n\n\nlwe = subset_GRACE.lwe_thickness\nprint(lwe)\n\n\n# ### __Return to the main room__\n#\n# Now that the GRACE data have also been loaded into xarray and subsetted, we'll return to the main room to begin our plotting and analysis.\n# ____\n\n# ## __Plotting and analysis__\n\n# __First we'll plot and animate the GRACE data over time, and then we'll subset and plot the SMAP data.__\n#\n# Now that we have a time/space slice of interest from the GRACE collection, let's plot the first time step, to see what we've got (or to see what it looks like. First we'll define two functions to make the plotting (and next animation step) a bit more convenient:\n\n\ndef setup_map(ax, pmap, ds_subset, x, y, var, t, cmap, levels, extent):\n title = str(pd.to_datetime(ds_subset.time[t].values))\n pmap.set_title(title, fontsize=14)\n pmap.coastlines()\n pmap.set_extent(extent)\n pmap.add_feature(cartopy.feature.RIVERS)\n variable_desired = var[t, :, :]\n cont = pmap.contourf(x, y, variable_desired,\n cmap=cmap, levels=levels, zorder=1)\n return cont\n\n\ndef animate_ts(framenumber, ax, pmap, ds_subset, x, y, var, t, cmap, levels, extent):\n cont = setup_map(ax, pmap, ds_subset, x, y, var, t +\n framenumber, cmap, levels, extent)\n return cont\n\n\n# Initialize a matplotlib plot object and add subplot:\nfig = plt.figure(figsize=[13, 9])\nax = fig.add_subplot(1, 1, 1)\n\n# Configure axes to display projected data using PlateCarree crs:\npmap = plt.axes(projection=ccrs.PlateCarree())\n\n# Get arrays of x and y to label the plot axes:\nx, y = np.meshgrid(subset_GRACE.lon, subset_GRACE.lat)\n\n# Set a few constants for plotting the GRACE-FO time series:\ntime_start = 168\ncmap_name = \"bwr_r\"\ncmap_levels = np.linspace(-100., 100., 14)\nmap_extent = [-85, -30, -16, 11]\n\n# Plot the first timestep:\ncont = setup_map(ax, pmap, subset_GRACE, x, y, lwe,\n time_start, cmap_name, cmap_levels, map_extent)\n\nfig.colorbar(cont, ticks=cmap_levels, orientation='horizontal',\n label='Land Water Equivalent Thickness (cm)')\n\n\n# #### Animate changes over time\n#\n# Let's now explore how land water mass changes throughout the year 2019, by creating an animation of GRACE monthly land water equivalent (LWE) maps over the Amazon River.\n#\n# Plot all the 2019 timesteps sequentially to create an animation of land water equivalent thickness for the Amazon Rainforest territories:\n\n\nani = animation.FuncAnimation(fig, animate_ts, frames=range(0, 12), fargs=(\n ax, pmap, subset_GRACE, x, y, lwe, time_start, cmap_name, cmap_levels, map_extent\n), interval=500)\n\nHTML(ani.to_html5_video())\n\n\n# User note: You will need to install 'ffmpeg' in the cmd prompt to save the .mpg to disk. 
Use the following command to install from the conda-forge channel:\n#\n# ```shell\n# conda install -c conda-forge ffmpeg\n# ```\n#\n# Uncomment, run the next cell to save the animation to MP4:\n\n\n#ani.save(\"earthdatacloud_animation_GRACEFO.mp4\", writer=animation.FFMpegWriter())\n\n\n# ### __SMAP Data Analysis__\n#\n# Normally, we'd pull the entire 2019 data for the SMAP collections as well (recall we only requested 1 month of data earlier in the notebook, due to current system constraints), average the daily SMAP data to monthly means, and compare with the monthly GRACE in a 2019 monthly time series.\n#\n# For now, let's create the building blocks for that workflow, by creating the 1 monthly mean for SMAP, plotting a spatial map of the SMAP SSS monthly mean to take a quick look at that data, and setting up the 2019 monthly time series plotting (without completing here).\n\n# Frist, we can subset the SMAP data as we did above with GRACE. Note the coordinate syntax differs slightly as noted below.\n\n\n# SMAP\n# switched lat directions from GRACE, and longitude has positives and negatives\nlat_bnds, lon_bnds = [-2, 6], [180-52, 180-43]\nds_SMAP_subset = ds_SMAP.sel(lat=slice(*lat_bnds), lon=slice(*lon_bnds))\nprint(ds_SMAP_subset.lat.min().data,\n ds_SMAP_subset.lat.max().data,\n ds_SMAP_subset.lon.min().data,\n ds_SMAP_subset.lon.max().data,)\nds_SMAP_subset\n\n\n# **Important**: remember to use `compute()` so that the `scale_factor` is applied!\n#\n# We'll also replace fill values with numpy.nan for our next monthly mean computation step.\n\n\n# Scale the values to their valid range (compute):\nsss_data = ds_SMAP_subset.sss_smap.data.compute()\n\n# Replace fills with numpy nan:\nsss_data[sss_data == -9999.] = np.nan\n\n# Replace the data array for the sss monthly mean variable:\nds_SMAP_subset.sss_smap.data = sss_data\n\nprint(ds_SMAP_subset.sss_smap)\n\n\n# Now compute monthly means using the convenient xarray `groupby` method and view the minimum and maximum monthly mean values.\n\n\nds_SMAP_subset_sss_momean = ds_SMAP_subset.sss_smap.groupby(\n \"time.month\").mean(skipna=True)\nds_SMAP_subset_sss_momean.min(\n skipna=True), ds_SMAP_subset_sss_momean.max(skipna=True)\n\n\n# #### __Plot the April mean SMAP SSS__\n#\n# Creating a basic plot is easy using the built-in plotting capabilities of xarray:\n\n\n# plot SMAP subset\nds_SMAP_subset_sss_momean.sel(month=4).plot()\nplt.show()\n\n\nsss_min, sss_max = [ds_SMAP_subset.sss_smap.min(skipna=True).compute(\n).data.item(), ds_SMAP_subset.sss_smap.max(skipna=True).compute().data.item()]\nsss_min, sss_max\n\n\n# A new figure window\nfig = plt.figure(figsize=[10, 8])\nax = fig.add_subplot(1, 1, 1) # specify (nrows, ncols, axnum)\npmap = plt.axes(projection=ccrs.PlateCarree())\n\n# Necessary Variables for functions\nextent = [180-52, 180-43, -2, 6] # lat/lon extents of map\n# x, y lat/lon values for functions\nx, y = np.meshgrid(ds_SMAP_subset.lon, ds_SMAP_subset.lat)\n# number of levels for color differentiation\nlevels = np.linspace(sss_min, sss_max, 10)\ncmap = 'viridis' # color scheme\nt = 1 # time to start with\n# variable we will be subsetting from the GRACE-FO data\nvar = ds_SMAP_subset.sss_smap.compute()\n# Time of specific time step\ntitle = str(pd.to_datetime(ds_SMAP_subset.time[t].values))\n\n# Set up first time step\ncont = setup_map(ax, pmap, ds_SMAP_subset, x, y, var, t, cmap, levels, extent)\n\n# Make a color bar\nfig.colorbar(cont, cmap=cmap, boundaries=levels, ticks=levels,\n orientation='horizontal', label='Sea Surface 
Salinity (psu)')\n\n\n# #### Animate April mean SMAP SSS\n#\n# Like we did with GRACE, we can animate the changes for the single month of SMAP data to see how salinty changes over time.\n\n\n# Create animation for April 2019 (change the frame range for different time periods)\nani = animation.FuncAnimation(fig, animate_ts, frames=range(0, ds_SMAP_subset.time.size-1), fargs=(\n ax, pmap, ds_SMAP_subset, x, y, var, t, cmap, levels, extent\n), interval=400)\n\nHTML(ani.to_html5_video())\n\n\n# ## __Tutorial Summary and Additional Resources__\n#\n# The building blocks are now in place to do a longer time series analysis across GRACE and SMAP data, to better understand the relationship between river discharge and sea surface salinity for impact assessment.\n#\n# To conclude, we've searched programmatically for data archived in the PO.DAAC Earthdata Cloud over a region and time period of interest, requested the data from the Harmony API, read the data directly into `xarray` from the staged s3 location within AWS `us-west-2` without having to pull the data down into local storage, and performed subsetting and plotting in preparation for a time series analysis.\n#\n# There are two more sections of resources below:\n#\n# 1. __Adding river height (pre-SWOT) data to our our analysis__\n# - Pre-SWOT data are available through the PO DAAC on-premise location, so you can continue your analysis with data both in and outside of the cloud using OPeNDAP.\n#\n# 2. __How to set up a Jupyter Notebook running in your own EC2 instance__\n# - This same workflow can be achieved outside of the Pangeo BinderHub within an EC2 instance running your personal AWS account. More information and instructions are available below.\n\n# ___\n#\n# #### __On-prem hydro data from Pre-SWOT MEaSUREs program__\n#\n# Data from [**PRESWOT_HYDRO_GRRATS_L2_DAILY_VIRTUAL_STATION_HEIGHTS_V2**](https://podaac.jpl.nasa.gov/dataset/PRESWOT_HYDRO_GRRATS_L2_DAILY_VIRTUAL_STATION_HEIGHTS_V2) are not currently available on the cloud, but we can access via the PO.DAAC's on-prem OPeNDAP service (Hyrax) instead.\n#\n# \n#\n# The guidebook explains the details of the Pre-SWOT MEaSUREs data: https://podaac-tools.jpl.nasa.gov/drive/files/allData/preswot_hydrology/L2/rivers/docs/GRRATS_user_handbookV2.pdf\n#\n# **Access URL for PO.DAAC on-prem OPeNDAP service**\n#\n# Identify an appropriate OPeNDAP endpoint through the following steps:\n#\n# 1. Go to the project/mission page on the PO.DAAC portal (e.g. for Pre-SWOT MEaSUREs: https://podaac.jpl.nasa.gov/MEaSUREs-Pre-SWOT)\n#\n# 2. Choose the dataset of interest. Go to the \"Data Access\" tab of the corresponding dataset landing page, which should like the OPeNDAP access link (for compatible datasets, e.g. for the daily river heights from virtual stations: https://podaac-opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/).\n#\n# 3. Navigate to the desired NetCDF file and copy the endpoint (e.g. for our Amazon Basin use case we choose the South America file: https://opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/South_America_Amazon1kmdaily.nc).\n#\n# ### Open netCDF file with *xarray*\n#\n# Open the netCDF dataset via OPeNDAP using *xarray*:\n\n\nds_MEaSUREs = xr.open_dataset(\n 'https://opendap.jpl.nasa.gov/opendap/allData/preswot_hydrology/L2/rivers/daily/South_America_Amazon1kmdaily.nc')\nprint(ds_MEaSUREs)\n\n\n# Our desired variable is height (meters above EGM2008 geoid) for this exercise, which can be subset by distance and time. 
Distance represents the distance from the river mouth, in this example, the Amazon estuary. Time is between April 8, 1993 and April 20, 2019.\n#\n# ### Plot\n#\n# **Amazon River heights for March 16, 2018**\n#\n# Plot the river distances and associated heights on the map at time t=9069:\n\n\nfig = plt.figure(figsize=[13, 9])\nax = plt.axes(projection=ccrs.PlateCarree())\nax.coastlines()\nax.set_extent([-85, -30, -20, 20])\nax.add_feature(cartopy.feature.RIVERS)\n\nplt.scatter(ds_MEaSUREs.lon, ds_MEaSUREs.lat,\n lw=1, c=ds_MEaSUREs.height[:, 9069])\nplt.colorbar(label='Interpolated River Heights (m)')\nplt.clim(-10, 100)\n\nplt.show()\n\n\n# For GRACE-FO, plotting lwe_thickness[107:179,34,69] indicates time, latitude, and longitude indices corresponding to the pixel for the time period 1/2019 to 12/2019 at lat/lon (-0.7, -50). For the 2019 year, measurements of LWE thickness followd expected patterns of high volume of water from the river output into the estuary.\n#\n# **2011-2019 Seasonality Plots (WIP)**\n#\n# For GRACE-FO, plotting lwe_thickness[107:179,34,69] indicates time, latitude, and longitude indices corresponding to the pixel for the time period 8/2011 to 12/2019 at lat/lon (-0.7, -50).\n\n\n# plot variables associated with river\nfig, ax1 = plt.subplots(figsize=[12, 7])\n# plot river height\nds_MEaSUREs.height[16, 6689:9469].plot(color='darkblue')\n", "project_metadata": {"full_name": "podaac/AGU-2020", "description": "PO.DAAC & NSIDC DAAC Github repository for AGU 2020 workshop.", "topics": [], "git_url": "git://github.com/podaac/AGU-2020.git", "stars": 7, "watchers": 7, "forks": 8, "created": "2020-10-29T22:01:50Z", "size": 92218, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1025184, "Python": 24228}, "last_updated": "2020-12-09T19:03:46Z"}, "intent": "# plot LWE thickness on secondary axis"}, {"original_comment": "# Newton-Rhapson Solver: Create a flopy nwt package object\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# 2D Example demonstrating a spill and well separated by a stream.\n\n#%%\n\nimport sys\nimport math\nfrom io import StringIO\nimport os\nimport shutil\nimport platform\nimport numpy as np\nfrom subprocess import check_output\nimport flopy\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport flopy.utils.binaryfile as bf\n\nimport config\n\nprint(np.__version__)\n\nmodelpth = os.path.join('Cara_model')\nmodelname = 'Spill'\nmfexe = 'mfnwt'\nmtexe = 'mt3dusgs'\nif platform.system() == 'Windows':\n mfexe += '.exe'\n mtexe += '.exe'\n\n# Instantiate MODFLOW model\nmf = flopy.modflow.Modflow(modelname, version='mfnwt', exe_name=mfexe,\n model_ws=modelpth)\n\n# Output Control: Create a flopy output control object\noc = flopy.modflow.ModflowOc(mf, stress_period_data={\n (0, 0): ['save head', 'save budget']})", "target_code": "headtol = 0.0001\nfluxtol = 0.06\nmaxiterout = 200\nthickfact = 1E-005\nlinmeth = 2\niprnwt = 1\nibotav = 0\nnwt = flopy.modflow.ModflowNwt(mf, headtol=headtol, fluxtol=fluxtol, maxiterout=maxiterout,\n thickfact=thickfact, linmeth=linmeth, iprnwt=iprnwt, ibotav=ibotav,\n options='COMPLEX')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# 2D Example demonstrating a spill and well separated by a stream.\n\n\nimport sys\nimport math\nfrom io import StringIO\nimport os\nimport shutil\nimport platform\nimport numpy as np\nfrom subprocess import check_output\nimport flopy\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport flopy.utils.binaryfile as bf\n\nimport 
config\n\nprint(np.__version__)\n\nmodelpth = os.path.join('Cara_model')\nmodelname = 'Spill'\nmfexe = 'mfnwt'\nmtexe = 'mt3dusgs'\nif platform.system() == 'Windows':\n mfexe += '.exe'\n mtexe += '.exe'\n\n# Instantiate MODFLOW model\nmf = flopy.modflow.Modflow(modelname, version='mfnwt', exe_name=mfexe,\n model_ws=modelpth)\n\n# Output Control: Create a flopy output control object\noc = flopy.modflow.ModflowOc(mf, stress_period_data={\n (0, 0): ['save head', 'save budget']})\n", "project_metadata": {"full_name": "langevin-usgs/gw3099_classrepo", "description": "Repository for the Advanced Modeling of Groundwater Flow training class", "topics": [], "git_url": "git://github.com/langevin-usgs/gw3099_classrepo.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2018-09-25T21:39:21Z", "size": 161866, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2635284, "Python": 433498, "Visual Basic": 60716, "Batchfile": 1416}, "last_updated": "2020-12-18T19:27:44Z"}, "intent": "# Newton-Rhapson Solver: Create a flopy nwt package object"}, {"original_comment": "# Make sure we are using PyTorch 1.5\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Table of Contents

\n# \n\n#

Setup

\n#\n#\n#
\n\n# Setup env on your local computer\n# -----\n#\n# 1. Install [Anaconda distribution Python 3.x version](https://www.anaconda.com/download/)\n# 1. `cd` into `resources/`\n# 1. Run `$ sh setup.sh`\n# 1. Wait \u231b\n# 1. Run\n# 1. `$ conda activate rl`\n# 2. `$ jupyter notebook`\n\n#%%\n\nfrom platform import python_version\nimport sys\n\n# Make sure the environment was started\nassert sys.base_prefix.endswith(\"rl\")\n\n#%%\n\n# Make sure we are using Python 3.7\nassert python_version().startswith('3.7')\n\n#%%\n\n# Kick the tires\ntry:\n import numpy as np\n import tensorflow as tf\n import torch\nexcept ModuleNotFoundError:\n print(\"Ask for help \u270b\")\n\n#%%\n\n# Make sure we are using at least TensorFlow 2.0\nassert tf.__version__.startswith('2.')\n\n#%%", "target_code": "assert torch.__version__.startswith('1.5')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Table of Contents

\n# \n\n#

Setup

\n#\n#\n#
\n\n# Setup env on your local computer\n# -----\n#\n# 1. Install [Anaconda distribution Python 3.x version](https://www.anaconda.com/download/)\n# 1. `cd` into `resources/`\n# 1. Run `$ sh setup.sh`\n# 1. Wait \u231b\n# 1. Run\n# 1. `$ conda activate rl`\n# 2. `$ jupyter notebook`\n\n\nfrom platform import python_version\nimport sys\n\n# Make sure the environment was started\nassert sys.base_prefix.endswith(\"rl\")\n\n\n# Make sure we are using Python 3.7\nassert python_version().startswith('3.7')\n\n\n# Kick the tires\ntry:\n import numpy as np\n import tensorflow as tf\n import torch\nexcept ModuleNotFoundError:\n print(\"Ask for help \u270b\")\n\n\n# Make sure we are using at least TensorFlow 2.0\nassert tf.__version__.startswith('2.')\n\n", "project_metadata": {"full_name": "brianspiering/rl-course", "description": "Introduction to Reinforcement Learning for MSDS at University of San Francisco", "topics": [], "git_url": "git://github.com/brianspiering/rl-course.git", "stars": 8, "watchers": 8, "forks": 6, "created": "2020-03-22T00:37:02Z", "size": 49772, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1056987, "Python": 6981, "Shell": 1569}, "last_updated": "2020-12-31T06:14:26Z"}, "intent": "# Make sure we are using PyTorch 1.5"}, {"original_comment": "# let's check the percentage of missing data\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Missing Values\n#\n# Missing data, or missing values, occur when __no data__ / __no value__ is stored for certain observations within a variable.\n#\n# Incomplete data is an unavoidable problem in most data sources, and may have a significant impact on the conclusions that can be derived from the data.\n#\n# ### Why is data missing?\n#\n# The source of missing data can be very different. These are just a few examples:\n#\n# - A value is missing because it was forgotten, lost or not stored properly\n# - For a certain observation, the value does not exist\n# - The value can't be known or identified\n#\n# In many organisations, information is collected into a form by a person talking with a client on the phone, or alternatively, by customers filling forms online. Often, the person entering the data does not complete all the fields in the form. Many of the fields are not compulsory, which may lead to missing values.\n#\n# The reasons for omitting the information can vary: perhaps the person does not want to disclose some information, for example income, or they do not know the answer, or the answer is not applicable for a certain circumstance, or on the contrary, the person in the organisation wants to spare the customer some time, and therefore omits asking questions they think are not so relevant.\n#\n# There are other cases where the value for a certain variable does not exist. For example, in the variable 'total debt as percentage of total income' (very common in financial data), if the person has no income, then the total percentage of 0 does not exist, and therefore it will be a missing value.\n#\n# It is important to understand **how the missing data are introduced in the dataset**, that is, the **mechanisms** by which missing information is introduced in a dataset. Depending on the mechanism, we may choose to process the missing values differently. 
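The missing-data mechanisms described above are easiest to grasp with a toy example. The following sketch is not part of the original notebook; it fabricates a small DataFrame (hypothetical `gender`/`weight` columns) and injects missingness whose probability depends on an observed variable, which is the MAR pattern defined here:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 1_000

# toy data: weight recorded for a mix of male and female respondents
df = pd.DataFrame({
    'gender': rng.choice(['male', 'female'], size=n),
    'weight': rng.normal(75, 12, size=n),
})

# MAR: the chance that weight is missing depends on another *observed* column (gender)
p_missing = np.where(df['gender'] == 'female', 0.4, 0.1)
df.loc[rng.random(n) < p_missing, 'weight'] = np.nan

# the missingness rate now differs by gender, which is the MAR signature
print(df['weight'].isnull().groupby(df['gender']).mean())
```

Under MCAR the two rates would be roughly equal; under MNAR the probability of being missing would depend on the unobserved weight values themselves.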
In addition, by knowing the source of missing data, we may choose to take action to control that source and decrease the amount of missing information looking forward during data collection.\n#\n#\n# ### Missing Data Mechanisms\n#\n# There are 3 mechanisms that lead to missing data, 2 of them involve missing data randomly or almost-randomly, and the third one involves a systematic loss of data.\n#\n# #### Missing Completely at Random, MCAR:\n#\n# A variable is missing completely at random (MCAR) if the probability of being missing is the same for all the observations.\n# When data is MCAR, there is absolutely no relationship between the data missing and any other values, observed or missing, within the dataset. In other words, those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than other. If values for observations are missing completely at random, then disregarding those cases would not bias the inferences made.\n#\n#\n# #### Missing at Random, MAR:\n#\n# MAR occurs when there is a relationship between the propensity of missing values and the observed data. In other words, the probability of an observation being missing depends on available information (i.e., other variables in the dataset). For example, if men are more likely to disclose their weight than women, weight is MAR. The weight information will be missing at random for those men and women who do not disclose their weight, but as men are more prone to disclose it, there will be more missing values for women than for men.\n#\n# In a situation like the above, if we decide to proceed with the variable with missing values (in this case weight), we might benefit from including gender to control the bias in weight for the missing observations.\n#\n#\n# #### Missing Not at Random, MNAR:\n#\n# Missing data is not at random (MNAR) when there is a mechanism or a reason why missing values are introduced in the dataset. For example, MNAR would occur if people failed to fill in a depression survey because of their level of depression. Here, the missing of data is related to the outcome, depression. Similarly, when a financial company asks for bank and identity documents from customers in order to prevent identity fraud, typically, fraudsters impersonating someone else will not upload documents, because they don't have them, because they are fraudsters. 
Therefore, there is a systematic relationship between the missing documents and the target we want to predict: fraud.\n#\n# Understanding the mechanism by which data is missing is important to decide which methods to use to impute the missing values.\n#\n# ====================================================================================================\n\n# ## In this Demo:\n#\n# In the following cells we will:\n#\n# - Learn how to detect and quantify missing values\n#\n# - Try to identify the 3 different mechanisms of missing data introduction\n#\n# We will use the Loan book data from Lending Club and the Titanic dataset.\n#\n# - To download the datasets, please refer to the **Datasets** lecture in **Section 1** of the course.\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n# to display the total number columns present in the dataset\npd.set_option('display.max_columns', None)\n\n#%%\n\n# let's load the titanic dataset\ndata = pd.read_csv('../titanic.csv')\n\n# let's inspect the first 5 rows\ndata.head()\n\n\n# In python, the missing values are stored as NaN, see for example the first row for the variable Cabin.\n\n#%%\n\n# we can quantify the total number of missing values using\n# the isnull method plus the sum method on the dataframe\n\ndata.isnull().sum()\n\n\n# There are 263 missing values for Age, 1014 for Cabin and 2 for Embarked.\n\n#%%\n\n# alternatively, we can use the mean method after isnull\n# to visualise the percentage of\n# missing values for each variable\n\ndata.isnull().mean()\n\n\n# There are missing data in the variables Age (20% missing), Cabin -in which the passenger was traveling- (77% missing), and Embarked -the port from which the passenger got into the Titanic- (~0.2% missing).\n\n# ## Mechanisms of Missing Data\n#\n# ### Missing data Not At Random (MNAR): Systematic missing values\n#\n# In the Titanic dataset, both the missing values of the variables **age** and **cabin**, were introduced systematically. For many of the people who did not survive, the **age** they had or the **cabin** they were traveling in, could not be established. The people who survived could be otherwise asked for that information.\n#\n# Can we infer this by looking at the data?\n#\n# In a situation like this, we could expect a greater number of missing values for people who did not survive.\n#\n# Let's have a look.\n\n#%%\n\n# let's create a binary variable that indicates\n# whether the value of cabin is missing\n\ndata['cabin_null'] = np.where(data['cabin'].isnull(), 1, 0)\n\n#%%\n\n# let's evaluate the percentage of missing values in\n# cabin for the people who survived vs the non-survivors.\n\n# the variable Survived takes the value 1 if the passenger\n# survived, or 0 otherwise\n\n# group data by Survived vs Non-Survived\n# and find the percentage of nulls for cabin\ndata.groupby(['survived'])['cabin_null'].mean()\n\n#%%\n\n# another way of doing the above, with less lines\n# of code :)\n\ndata['cabin'].isnull().groupby(data['survived']).mean()\n\n\n# We observe that the percentage of missing values is higher for people who did not survive (87%), respect to people who survived (60%). This finding is aligned with our hypothesis that the data is missing because after people died, the information could not be retrieved.\n#\n# **Note**: Having said this, to truly underpin whether the data is missing not at random, we would need to get extremely familiar with the way data was collected. 
Analysing datasets, can only point us in the right direction or help us build assumptions.\n\n#%%\n\n# Let's do the same for the variable age:\n\n# First we create a binary variable to indicates\n# whether the value of Age is missing\n\ndata['age_null'] = np.where(data['age'].isnull(), 1, 0)\n\n# and then look at the mean in the different survival groups:\ndata.groupby(['survived'])['age_null'].mean()\n\n#%%\n\n# or the same with simpler code :)\n\ndata['age'].isnull().groupby(data['survived']).mean()\n\n\n# Again, we observe a higher number of missing data for the people who did not survive the tragedy. The analysis therefore suggests that there is a systematic loss of data: people who did not survive tend to have more missing information. Presumably, the method chosen to gather the information, contributes to the generation of these missing data.\n\n# ### Missing data Completely At Random (MCAR)\n\n#%%\n\n# In the titanic dataset, there are also missing values\n# for the variable Embarked.\n# Let's have a look.\n\n# Let's slice the dataframe to show only the observations\n# with missing values for Embarked\n\ndata[data['embarked'].isnull()]\n\n\n# These 2 women were traveling together, Miss Icard was the maid of Mrs Stone.\n#\n# A priori, there does not seem to be an indication that the missing information in the variable Embarked is depending on any other variable, and the fact that these women survived, means that they could have been asked for this information.\n#\n# Very likely the values were lost at the time of building the dataset.\n#\n# If these values are MCAR, the probability of data being missing for these 2 women is the same as the probability for values to missing for any other person on the titanic. Of course this will be hard, if possible at all, to prove. But I hope this serves as a demonstration.\n\n# ### Missing data at Random (MAR)\n#\n# For this example, I will use the Lending Club loan book. I will look at the variables employer name (emp_title) and years in employment (emp_length), both declared by the borrowers at the time of applying for a loan. emp_title refers to the name of the company for which the borrower works. emp_length refers to how many years the borrower has worked for the company mentioned in emp_title. In this example, data missing in emp_title is associated with undeclared length of work in emp_length.\n\n#%%\n\n# let's load the columns of interest from the\n# Lending Club loan book dataset\n\n##########################################\n# Note: newer versions of pandas automatically cast strings as NA,\n# so to follow along with the notebook load the data as below if using\n# the latest pandas version. Loading method may need to be adjusted if\n# using older versions of pandas\n##########################################\n\ndata = pd.read_csv('../loan.csv',\n usecols=['emp_title', 'emp_length'],\n na_values='',\n keep_default_na=False)\ndata.head()\n\n#%%", "target_code": "data.isnull().mean()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Missing Values\n#\n# Missing data, or missing values, occur when __no data__ / __no value__ is stored for certain observations within a variable.\n#\n# Incomplete data is an unavoidable problem in most data sources, and may have a significant impact on the conclusions that can be derived from the data.\n#\n# ### Why is data missing?\n#\n# The source of missing data can be very different. 
These are just a few examples:\n#\n# - A value is missing because it was forgotten, lost or not stored properly\n# - For a certain observation, the value does not exist\n# - The value can't be known or identified\n#\n# In many organisations, information is collected into a form by a person talking with a client on the phone, or alternatively, by customers filling forms online. Often, the person entering the data does not complete all the fields in the form. Many of the fields are not compulsory, which may lead to missing values.\n#\n# The reasons for omitting the information can vary: perhaps the person does not want to disclose some information, for example income, or they do not know the answer, or the answer is not applicable for a certain circumstance, or on the contrary, the person in the organisation wants to spare the customer some time, and therefore omits asking questions they think are not so relevant.\n#\n# There are other cases where the value for a certain variable does not exist. For example, in the variable 'total debt as percentage of total income' (very common in financial data), if the person has no income, then the total percentage of 0 does not exist, and therefore it will be a missing value.\n#\n# It is important to understand **how the missing data are introduced in the dataset**, that is, the **mechanisms** by which missing information is introduced in a dataset. Depending on the mechanism, we may choose to process the missing values differently. In addition, by knowing the source of missing data, we may choose to take action to control that source and decrease the amount of missing information looking forward during data collection.\n#\n#\n# ### Missing Data Mechanisms\n#\n# There are 3 mechanisms that lead to missing data, 2 of them involve missing data randomly or almost-randomly, and the third one involves a systematic loss of data.\n#\n# #### Missing Completely at Random, MCAR:\n#\n# A variable is missing completely at random (MCAR) if the probability of being missing is the same for all the observations.\n# When data is MCAR, there is absolutely no relationship between the data missing and any other values, observed or missing, within the dataset. In other words, those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than other. If values for observations are missing completely at random, then disregarding those cases would not bias the inferences made.\n#\n#\n# #### Missing at Random, MAR:\n#\n# MAR occurs when there is a relationship between the propensity of missing values and the observed data. In other words, the probability of an observation being missing depends on available information (i.e., other variables in the dataset). For example, if men are more likely to disclose their weight than women, weight is MAR. The weight information will be missing at random for those men and women who do not disclose their weight, but as men are more prone to disclose it, there will be more missing values for women than for men.\n#\n# In a situation like the above, if we decide to proceed with the variable with missing values (in this case weight), we might benefit from including gender to control the bias in weight for the missing observations.\n#\n#\n# #### Missing Not at Random, MNAR:\n#\n# Missing data is not at random (MNAR) when there is a mechanism or a reason why missing values are introduced in the dataset. 
For example, MNAR would occur if people failed to fill in a depression survey because of their level of depression. Here, the missing of data is related to the outcome, depression. Similarly, when a financial company asks for bank and identity documents from customers in order to prevent identity fraud, typically, fraudsters impersonating someone else will not upload documents, because they don't have them, because they are fraudsters. Therefore, there is a systematic relationship between the missing documents and the target we want to predict: fraud.\n#\n# Understanding the mechanism by which data is missing is important to decide which methods to use to impute the missing values.\n#\n# ====================================================================================================\n\n# ## In this Demo:\n#\n# In the following cells we will:\n#\n# - Learn how to detect and quantify missing values\n#\n# - Try to identify the 3 different mechanisms of missing data introduction\n#\n# We will use the Loan book data from Lending Club and the Titanic dataset.\n#\n# - To download the datasets, please refer to the **Datasets** lecture in **Section 1** of the course.\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n# to display the total number columns present in the dataset\npd.set_option('display.max_columns', None)\n\n\n# let's load the titanic dataset\ndata = pd.read_csv('../titanic.csv')\n\n# let's inspect the first 5 rows\ndata.head()\n\n\n# In python, the missing values are stored as NaN, see for example the first row for the variable Cabin.\n\n\n# we can quantify the total number of missing values using\n# the isnull method plus the sum method on the dataframe\n\ndata.isnull().sum()\n\n\n# There are 263 missing values for Age, 1014 for Cabin and 2 for Embarked.\n\n\n# alternatively, we can use the mean method after isnull\n# to visualise the percentage of\n# missing values for each variable\n\ndata.isnull().mean()\n\n\n# There are missing data in the variables Age (20% missing), Cabin -in which the passenger was traveling- (77% missing), and Embarked -the port from which the passenger got into the Titanic- (~0.2% missing).\n\n# ## Mechanisms of Missing Data\n#\n# ### Missing data Not At Random (MNAR): Systematic missing values\n#\n# In the Titanic dataset, both the missing values of the variables **age** and **cabin**, were introduced systematically. For many of the people who did not survive, the **age** they had or the **cabin** they were traveling in, could not be established. 
The people who survived could be otherwise asked for that information.\n#\n# Can we infer this by looking at the data?\n#\n# In a situation like this, we could expect a greater number of missing values for people who did not survive.\n#\n# Let's have a look.\n\n\n# let's create a binary variable that indicates\n# whether the value of cabin is missing\n\ndata['cabin_null'] = np.where(data['cabin'].isnull(), 1, 0)\n\n\n# let's evaluate the percentage of missing values in\n# cabin for the people who survived vs the non-survivors.\n\n# the variable Survived takes the value 1 if the passenger\n# survived, or 0 otherwise\n\n# group data by Survived vs Non-Survived\n# and find the percentage of nulls for cabin\ndata.groupby(['survived'])['cabin_null'].mean()\n\n\n# another way of doing the above, with less lines\n# of code :)\n\ndata['cabin'].isnull().groupby(data['survived']).mean()\n\n\n# We observe that the percentage of missing values is higher for people who did not survive (87%), respect to people who survived (60%). This finding is aligned with our hypothesis that the data is missing because after people died, the information could not be retrieved.\n#\n# **Note**: Having said this, to truly underpin whether the data is missing not at random, we would need to get extremely familiar with the way data was collected. Analysing datasets, can only point us in the right direction or help us build assumptions.\n\n\n# Let's do the same for the variable age:\n\n# First we create a binary variable to indicates\n# whether the value of Age is missing\n\ndata['age_null'] = np.where(data['age'].isnull(), 1, 0)\n\n# and then look at the mean in the different survival groups:\ndata.groupby(['survived'])['age_null'].mean()\n\n\n# or the same with simpler code :)\n\ndata['age'].isnull().groupby(data['survived']).mean()\n\n\n# Again, we observe a higher number of missing data for the people who did not survive the tragedy. The analysis therefore suggests that there is a systematic loss of data: people who did not survive tend to have more missing information. Presumably, the method chosen to gather the information, contributes to the generation of these missing data.\n\n# ### Missing data Completely At Random (MCAR)\n\n\n# In the titanic dataset, there are also missing values\n# for the variable Embarked.\n# Let's have a look.\n\n# Let's slice the dataframe to show only the observations\n# with missing values for Embarked\n\ndata[data['embarked'].isnull()]\n\n\n# These 2 women were traveling together, Miss Icard was the maid of Mrs Stone.\n#\n# A priori, there does not seem to be an indication that the missing information in the variable Embarked is depending on any other variable, and the fact that these women survived, means that they could have been asked for this information.\n#\n# Very likely the values were lost at the time of building the dataset.\n#\n# If these values are MCAR, the probability of data being missing for these 2 women is the same as the probability for values to missing for any other person on the titanic. Of course this will be hard, if possible at all, to prove. But I hope this serves as a demonstration.\n\n# ### Missing data at Random (MAR)\n#\n# For this example, I will use the Lending Club loan book. I will look at the variables employer name (emp_title) and years in employment (emp_length), both declared by the borrowers at the time of applying for a loan. emp_title refers to the name of the company for which the borrower works. 
emp_length refers to how many years the borrower has worked for the company mentioned in emp_title. In this example, data missing in emp_title is associated with undeclared length of work in emp_length.\n\n\n# let's load the columns of interest from the\n# Lending Club loan book dataset\n\n##########################################\n# Note: newer versions of pandas automatically cast strings as NA,\n# so to follow along with the notebook load the data as below if using\n# the latest pandas version. Loading method may need to be adjusted if\n# using older versions of pandas\n##########################################\n\ndata = pd.read_csv('../loan.csv',\n usecols=['emp_title', 'emp_length'],\n na_values='',\n keep_default_na=False)\ndata.head()\n\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "intent": "# percentage of missing data"}, {"original_comment": "# ### With 3-fold Validation\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom sklearn import cross_validation, datasets, svm\nfrom sklearn import cross_validation\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn import datasets\nimport numpy as np\nfrom sklearn import datasets, svm\nnp.random.seed(0)\n\n\n# # Cross-Validation\n\n# Cross validation is a way of evaluating a model on a dataset. It provides an estimation of the accuracy of the model if it were to make predictions on previously unseen data. Cross validation estimations are used to aid in the selection of a robust model that is fit for purpose.\n\n# How does it work?\n\n# * Test-train Split\n# * n-fold cross-validation\n\n# ## From Scratch\n\n# ### Straight-forward\n\n#%%\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\nsvc = svm.SVC(C=1, kernel='linear')\nsvc.fit(X_digits[:-100], y_digits[:-100]\n ).score(X_digits[-100:], y_digits[-100:])", "target_code": "X_folds = np.array_split(X_digits, 3)\ny_folds = np.array_split(y_digits, 3)\nscores = list()\nfor k in range(3):\n X_train = list(X_folds)\n X_test = X_train.pop(k)\n X_train = np.concatenate(X_train)\n y_train = list(y_folds)\n y_test = y_train.pop(k)\n y_train = np.concatenate(y_train)\n scores.append(svc.fit(X_train, y_train).score(X_test, y_test))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport matplotlib.pyplot as plt\nfrom sklearn import cross_validation, datasets, svm\nfrom sklearn import cross_validation\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn import datasets\nimport numpy as np\nfrom sklearn import datasets, svm\nnp.random.seed(0)\n\n\n# # Cross-Validation\n\n# Cross validation is a way of evaluating a model on a dataset. It provides an estimation of the accuracy of the model if it were to make predictions on previously unseen data. 
Cross validation estimations are used to aid in the selection of a robust model that is fit for purpose.\n\n# How does it work?\n\n# * Test-train Split\n# * n-fold cross-validation\n\n# ## From Scratch\n\n# ### Straight-forward\n\n\ndigits = datasets.load_digits()\nX_digits = digits.data\ny_digits = digits.target\nsvc = svm.SVC(C=1, kernel='linear')\nsvc.fit(X_digits[:-100], y_digits[:-100]\n ).score(X_digits[-100:], y_digits[-100:])\n\n\n\n", "project_metadata": {"full_name": "soumendra/lecture_bdap2015", "description": "Lecture Notes and Class Assignments for the Big Data and Analytics Program, 2015.", "topics": [], "git_url": "git://github.com/soumendra/lecture_bdap2015.git", "stars": 7, "watchers": 7, "forks": 29, "created": "2015-11-03T01:35:38Z", "size": 4076, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2938983, "R": 2126}, "last_updated": "2019-01-06T17:53:47Z"}, "intent": "# 3-fold Validation"}, {"original_comment": "# Plot the actual sample to verify\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport tensorflow as tf\nimport keras\nfrom keras.datasets import mnist\nfrom keras.utils import np_utils\nfrom keras import regularizers\nfrom keras.layers import Flatten, Dense, Dropout\nfrom keras.models import Sequential\nimport numpy as np\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n#! pip install --upgrade numpy --user\n\n#%%\n\n# Version check\n\n\nprint('NumPy version :', np.__version__)\nprint('Keras version', keras.__version__)\nprint('With tensorflow backend, version : ', tf.__version__)\n\n#%%\n\n# Load data\n\n(x_train, y_train), (x_test, y_test) = mnist.load_data()\n\n#%%\n\n# Check type and dimensions\n\ntype(x_train[0]), x_train.shape, y_train.shape\n\n#%%\n\n# Plot out an example\n\nplt.imshow(x_train[0], cmap=plt.cm.binary)\n\n#%%\n\n# Normalize data\n\n# scaling the pixel values that are between 0-225\nx_train = keras.utils.normalize(x_train, axis=1)\nx_test = keras.utils.normalize(x_test, axis=1)\n# experiment without normalization, makes difference in learning\n\n#%%\n\n# Plot normalized\n\nplt.imshow(x_train[0], cmap=plt.cm.binary)\n\n#%%\n\n# Build a model : feed forward net\n\nmodel = Sequential()\n\nmodel.add(Flatten(input_shape=(28, 28,)))\n# , kernel_initializer='glorot_uniform', bias_initializer='zeros'))\nmodel.add(Dense(1024, kernel_regularizer=regularizers.l2(0.0001), activation='relu'))\nmodel.add(Dense(28, kernel_regularizer=regularizers.l2(0.0001), activation='relu'))\nmodel.add(Dense(10, activation='softmax'))\n\n#%%\n\n# compile model\n\nmodel.compile(optimizer='adam', # 'sgd'\n loss='sparse_categorical_crossentropy',\n metrics=['accuracy'])\n\n#%%\n\n# View model config\n\nmodel.summary()\n\n#%%\n\n# Initiate training session\n\n# other arguments validation split=0.33, batch size=10\nmodel.fit(x_train, y_train, epochs=5, batch_size=100)\n\n#%%\n\n# Evaluate on test set\n\nmodel.evaluate(x_test, y_test)\n\n#%%\n\n# Save model\n\nmodel.save('mnist_nn.model')\n\n#%%\n\n# Load model\n\nload_model = keras.models.load_model(\n 'mnist_nn.model') # nesting & function argument\n\n#%%\n\n# use the inference graph generated by the model to predict class labels on our test set\n\npredictions = load_model.predict([x_test])\n\n#%%\n\n# Choose individuaL sample from test set\nsample_index = 1\n\n# print maximum predicticted value sample\nprint('Network thinks it is seeing the number : |',\n np.argmax(predictions[sample_index]), '|')\n\n#%%", 
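For the cross-validation example above, the same 3-fold estimate can also be obtained without writing the fold loop by hand. This is not part of the original notebook; it is a minimal sketch that assumes a scikit-learn version providing `sklearn.model_selection` (the notebook itself imports the older `cross_validation` module):

```python
from sklearn import datasets, svm
from sklearn.model_selection import cross_val_score

digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

svc = svm.SVC(C=1, kernel='linear')

# 3-fold cross-validation: fit on two folds, score on the held-out fold, repeat
scores = cross_val_score(svc, X_digits, y_digits, cv=3)
print(scores, scores.mean())
```

The per-fold scores should be close to the single train/test split shown above; averaging them gives a more stable estimate of generalisation accuracy.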
"target_code": "plt.imshow(x_test[sample_index])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport tensorflow as tf\nimport keras\nfrom keras.datasets import mnist\nfrom keras.utils import np_utils\nfrom keras import regularizers\nfrom keras.layers import Flatten, Dense, Dropout\nfrom keras.models import Sequential\nimport numpy as np\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n#! pip install --upgrade numpy --user\n\n\n# Version check\n\n\nprint('NumPy version :', np.__version__)\nprint('Keras version', keras.__version__)\nprint('With tensorflow backend, version : ', tf.__version__)\n\n\n# Load data\n\n(x_train, y_train), (x_test, y_test) = mnist.load_data()\n\n\n# Check type and dimensions\n\ntype(x_train[0]), x_train.shape, y_train.shape\n\n\n# Plot out an example\n\nplt.imshow(x_train[0], cmap=plt.cm.binary)\n\n\n# Normalize data\n\n# scaling the pixel values that are between 0-225\nx_train = keras.utils.normalize(x_train, axis=1)\nx_test = keras.utils.normalize(x_test, axis=1)\n# experiment without normalization, makes difference in learning\n\n\n# Plot normalized\n\nplt.imshow(x_train[0], cmap=plt.cm.binary)\n\n\n# Build a model : feed forward net\n\nmodel = Sequential()\n\nmodel.add(Flatten(input_shape=(28, 28,)))\n# , kernel_initializer='glorot_uniform', bias_initializer='zeros'))\nmodel.add(Dense(1024, kernel_regularizer=regularizers.l2(0.0001), activation='relu'))\nmodel.add(Dense(28, kernel_regularizer=regularizers.l2(0.0001), activation='relu'))\nmodel.add(Dense(10, activation='softmax'))\n\n\n# compile model\n\nmodel.compile(optimizer='adam', # 'sgd'\n loss='sparse_categorical_crossentropy',\n metrics=['accuracy'])\n\n\n# View model config\n\nmodel.summary()\n\n\n# Initiate training session\n\n# other arguments validation split=0.33, batch size=10\nmodel.fit(x_train, y_train, epochs=5, batch_size=100)\n\n\n# Evaluate on test set\n\nmodel.evaluate(x_test, y_test)\n\n\n# Save model\n\nmodel.save('mnist_nn.model')\n\n\n# Load model\n\nload_model = keras.models.load_model(\n 'mnist_nn.model') # nesting & function argument\n\n\n# use the inference graph generated by the model to predict class labels on our test set\n\npredictions = load_model.predict([x_test])\n\n\n# Choose individuaL sample from test set\nsample_index = 1\n\n# print maximum predicticted value sample\nprint('Network thinks it is seeing the number : |',\n np.argmax(predictions[sample_index]), '|')\n\n\n", "project_metadata": {"full_name": "PacktPublishing/Hands-On-Neural-Networks-with-Keras", "description": "Hands-On Neural Networks with Keras, published by Packt", "topics": [], "git_url": "git://github.com/PacktPublishing/Hands-On-Neural-Networks-with-Keras.git", "stars": 13, "watchers": 13, "forks": 6, "created": "2019-01-14T06:16:17Z", "size": 18356, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 26882646}, "last_updated": "2020-12-18T21:19:47Z"}, "intent": "# Plot the actual sample"}, {"original_comment": "# Inspect top 10 rows of the prepared market data:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 03 - \"Financial Data Science: Moving Average Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **trend-following** trading strategy. 
To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **trend-following trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport bt as bt # library to backtest trading signals\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport pandas_datareader as dr\nimport numpy as np\nimport pandas as pd\nimport itertools as it\nimport datetime as dt\nimport os as os\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Suppress potential warnings due to recent library enhancements:\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import python utility libraries\n\n# import python data science libraries\n\n# import the pandas financial data reader library\n\n# import the matplotlib and seaborn visualization library\n\n\n# Install the Python `BT` backtesting library:\n\n#%%\n\nget_ipython().system('pip install bt')\n\n\n# Upon successful installation let's import the Python `BT` backtesting library:\n\n#%%\n\n# Let's also set a couple of general plot parameters:\n\n#%%\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n#%%\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download market price data starting from the **31.12.1990** until the **31.12.2017** to develop and evaluate a simple momentum trading strategy:\n\n#%%\n\n# set to start and end date of the data download\nstart_date = dt.datetime(1990, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"International Business Machines\" (IBM) market data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n#%%\n\n# download ibm market data\nibm_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `IBM` data downloaded:\n\n#%%\n\nibm_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `IBM` data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot ibm stock daily adjusted closing prices\nax.plot(ibm_data.index, ibm_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines Corporation (IBM) - Historical Stock Prices', fontsize=10)\n\n\n# Save the downloaded `IBM` data to the local directory:\n\n#%%\n\nibm_data.to_csv('./datasets/ibm_data_1990_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Moving Average Crossover Strategy Implementation\n\n# Let's implement a simple **Moving Average Crossover** trading strategy. In general, **crossover trading** refers to the idea that changes of market situations can be determined based on price \u201cbreakouts\u201d. A crossover can be interpreted as another measure of a financial instruments momentum.\u00a0In the past crossover signals have been extensively used to determine that it\u2019s time to either buy or sell the underlying asset.\n#\n# The price crossover signals of a simple **Moving Average Crossover** trading strategy are triggered by the following events:\n#\n# >- Generate a **short** trading signal once the price of a financial instrument drops below the general price trend, e.g., 100-days moving average band (\"Sell Sign Crossover\", left image below).\n# >- Generate a **long** trading signal once the price of a financial instrument exceeds the general price trend, e.g., 100-days moving average band (\"Buy Sign Crossover\", right image below).\n\n# \n\n# An enhancement of the **Moving Average Crossover** is to apply two moving averages to a chart: one long running moving average (e.g., a 200-days SMAV) and one short running moving average (e.g., 20-days SMAV). Once the short running moving average crosses above the long running moving average a **Buy** or **Long** signal is triggered, as it indicates that the trend is shifting up (this is known as a \"golden cross\"). On the other hand, when the short running moving average crosses below the long running moving average, a **Sell** or **Short** signal is triggered, as it indicates that the trend is shifting down (his is known as a \"dead/death cross\")\n\n# Let's start implementing this enhanced trading strategy by setting the distinct moving average window sizes that specify the number of historical daily adjusted closing prices of the IBM stock to be considered in the calculation of the rolling moving average:\n\n#%%\n\n# set \"fast\" short-running moving average indicator lookback, days = 15\ncross_mav_days_15 = 15\n# set \"slow\" short-running moving average indicator lookback, days = 60\ncross_mav_days_60 = 60\n# set \"trend\" long-running moving average indicator lookback, days = 200\ncross_mav_days_200 = 200\n\n\n# Calculate the rolling moving averages of window sizes: 15 days, 50 days and 200 days. 
In general the **\"Simple Moving Average (SMAV)\"** of a financial instrument $i$ (e.g., a stock, commodity, fx-rate) is defined as the mean of the previous $n$ prices, formally denoted by:\n#\n\n# $$SMA_{i}(t)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$$\n\n# were $t$ denotes the current point in time and $n$ the lookback.\n\n# We can calculate the SMAV by just using the Pandas `rolling()` and `mean()`function:\n\n#%%\n\ncross_mav_15 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_15).mean(), name='SMAV_15')\ncross_mav_60 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_60).mean(), name='SMAV_60')\ncross_mav_200 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_200).mean(), name='SMAV_200')\n\n\n# Merge the rolling moving average values with the original market data (adjusted closing prices):\n\n#%%\n\ncross_mav_ibm_data = ibm_data.join(cross_mav_15)\ncross_mav_ibm_data = cross_mav_ibm_data.join(cross_mav_60)\ncross_mav_ibm_data = cross_mav_ibm_data.join(cross_mav_200)\n\n\n# Inspect and validate the daily adjusted closing prices of the IBM stock as well as the derived moving average values starting from the first obtained 200-day moving average market price:\n\n#%%\n\ncross_mav_ibm_data[['Adj Close', 'SMAV_15',\n 'SMAV_60', 'SMAV_200']].iloc[200:210]\n\n\n# Plot the historical daily adjusted closing prices of the IBM stock (blue) as well as its 15 days (green), 60 days (red) as well as 200 days (yellow) rolling moving averages:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot adjusted closing prices and moving averages\nax.plot(cross_mav_ibm_data['Adj Close'], lw=1.0,\n color='#9b59b6', label='Closing Prices (purple)')\nax.plot(cross_mav_ibm_data['SMAV_15'], color='C1',\n lw=1.0, label='15-day MAV (green)')\nax.plot(cross_mav_ibm_data['SMAV_60'], color='C1',\n lw=1.0, label='60-day MAV (red)')\nax.plot(cross_mav_ibm_data['SMAV_200'], color='C4',\n lw=1.0, label='200-day MAV (yellow)')\n\n# rotate x-tick labels\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines Corporation (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# ### 4. Moving Average Crossover Signal Generation\n\n# Derive trading signals from of two distinct moving average crossover trading strategy configurations. We will generate a **long-signal** (+1.0) for the time intervals where the fast moving averages are above the 200-day moving average. 
In addition we generate a **short-signal** (-1.0) for the time intervals where the fast moving averages are below the 200-day moving average:\n\n#%%\n\n# create 'fast' trend-following signals\ncross_mav_ibm_data['SIGNAL_15'] = 0.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_15'] >\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_15'] = 1.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_15'] <\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_15'] = -1.0\n\n# create 'slow' trend-following signals\ncross_mav_ibm_data['SIGNAL_60'] = 0.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_60'] >\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_60'] = 1.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_60'] <\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_60'] = -1.0\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n#%%\n\ncross_mav_ibm_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the long-/short-term moving-average trading strategy:\n\n#%%\n\n# convert signals to Pandas DataFrame\ncross_mav_ibm_signal_data = pd.DataFrame(cross_mav_ibm_data[[\n 'SIGNAL_15', 'SIGNAL_60', 'SIGNAL_BASE']], columns=['SIGNAL_15', 'SIGNAL_60', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\ncross_mav_ibm_signal_data = cross_mav_ibm_signal_data.set_index(\n pd.to_datetime(ibm_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n#%%\n\ncross_mav_ibm_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the 15-days and 60-days crossover moving average trading strategies:\n\n#%%\n\ncross_mav_ibm_signal_data[cross_mav_ibm_signal_data['SIGNAL_15']\n != cross_mav_ibm_signal_data['SIGNAL_60']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\nax[0].plot(cross_mav_ibm_signal_data['SIGNAL_15'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(cross_mav_ibm_signal_data['SIGNAL_60'],\n lw=1.0, color='C1', label='SMAV 60 (green)')\nax[2].plot(cross_mav_ibm_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[smav 15 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[smav 60 signal]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'International Business Machines Corporation (IBM) - 15 days Crossover Moving Average Trading Signals', fontsize=10)\nax[1].set_title(\n 'International Business Machines Corporation (IBM) - 60 days Crossover Moving Average Trading Signals', fontsize=10)\nax[2].set_title(\n 'International Business Machines Corporation (IBM) - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n#%%\n\n# signal changes of the 15-200 days moving average crossover trading 
strategy\nlen(list(it.groupby(cross_mav_ibm_signal_data['SIGNAL_15'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the 60-200 days moving average crossover trading strategy\nlen(list(it.groupby(cross_mav_ibm_signal_data['SIGNAL_60'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n cross_mav_ibm_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. Moving Average Crossover Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the crossover moving average trading strategy configurations:\n\n#%%\n\n# extract the ibm stock closing prices\nibm_market_data = pd.DataFrame(ibm_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'ibm' (since this is the column we want to allocate to in the backtest)\nibm_market_data = ibm_market_data.rename(columns={'Adj Close': 'IBM'})\n\n# convert pandas DataFrame index to datatype: datetime\nibm_market_data = ibm_market_data.set_index(pd.to_datetime(ibm_data.index))", "target_code": "ibm_market_data.head(10)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 03 - \"Financial Data Science: Moving Average Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **trend-following** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **trend-following trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n\nimport bt as bt # library to backtest trading signals\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport pandas_datareader as dr\nimport numpy as np\nimport pandas as pd\nimport itertools as it\nimport datetime as dt\nimport os as os\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Suppress potential warnings due to recent library enhancements:\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import python utility libraries\n\n# import python data science libraries\n\n# import the pandas financial data reader library\n\n# import the matplotlib and seaborn visualization library\n\n\n# Install the Python `BT` backtesting library:\n\n\nget_ipython().system('pip install bt')\n\n\n# Upon successful installation let's import the Python `BT` backtesting library:\n\n\n# Let's also set a couple of general plot parameters:\n\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download market price data starting from the **31.12.1990** until the **31.12.2017** to develop and evaluate a simple momentum trading strategy:\n\n\n# set to start and end date of the data download\nstart_date = dt.datetime(1990, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"International Business Machines\" (IBM) market data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n\n# download ibm market data\nibm_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `IBM` data downloaded:\n\n\nibm_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `IBM` data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot ibm stock daily adjusted closing prices\nax.plot(ibm_data.index, ibm_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines Corporation (IBM) - Historical Stock Prices', fontsize=10)\n\n\n# Save the downloaded `IBM` data to the local directory:\n\n\nibm_data.to_csv('./datasets/ibm_data_1990_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Moving Average Crossover Strategy Implementation\n\n# Let's implement a simple **Moving Average Crossover** trading strategy. In general, **crossover trading** refers to the idea that changes of market situations can be determined based on price \u201cbreakouts\u201d. 
A crossover can be interpreted as another measure of a financial instruments momentum.\u00a0In the past crossover signals have been extensively used to determine that it\u2019s time to either buy or sell the underlying asset.\n#\n# The price crossover signals of a simple **Moving Average Crossover** trading strategy are triggered by the following events:\n#\n# >- Generate a **short** trading signal once the price of a financial instrument drops below the general price trend, e.g., 100-days moving average band (\"Sell Sign Crossover\", left image below).\n# >- Generate a **long** trading signal once the price of a financial instrument exceeds the general price trend, e.g., 100-days moving average band (\"Buy Sign Crossover\", right image below).\n\n# \n\n# An enhancement of the **Moving Average Crossover** is to apply two moving averages to a chart: one long running moving average (e.g., a 200-days SMAV) and one short running moving average (e.g., 20-days SMAV). Once the short running moving average crosses above the long running moving average a **Buy** or **Long** signal is triggered, as it indicates that the trend is shifting up (this is known as a \"golden cross\"). On the other hand, when the short running moving average crosses below the long running moving average, a **Sell** or **Short** signal is triggered, as it indicates that the trend is shifting down (his is known as a \"dead/death cross\")\n\n# Let's start implementing this enhanced trading strategy by setting the distinct moving average window sizes that specify the number of historical daily adjusted closing prices of the IBM stock to be considered in the calculation of the rolling moving average:\n\n\n# set \"fast\" short-running moving average indicator lookback, days = 15\ncross_mav_days_15 = 15\n# set \"slow\" short-running moving average indicator lookback, days = 60\ncross_mav_days_60 = 60\n# set \"trend\" long-running moving average indicator lookback, days = 200\ncross_mav_days_200 = 200\n\n\n# Calculate the rolling moving averages of window sizes: 15 days, 50 days and 200 days. 
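\n#\n# (Before the formal definition and the full implementation that follow, here is a minimal hedged sketch of how the golden/death crosses described above can be flagged from two rolling means; the variable names are illustrative and not part of the original lab.)\n\n\n# hedged sketch: a cross is a sign change of (fast moving average minus trend moving average)\nfast_sketch = ibm_data['Adj Close'].rolling(window=cross_mav_days_15).mean()\ntrend_sketch = ibm_data['Adj Close'].rolling(window=cross_mav_days_200).mean()\nspread_sign = np.sign(fast_sketch - trend_sketch)\ngolden_crosses = (spread_sign.diff() > 0)  # fast average crosses above the trend average\ndeath_crosses = (spread_sign.diff() < 0)   # fast average crosses below the trend average\ngolden_crosses.sum(), death_crosses.sum()\n\n\n# 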
In general the **\"Simple Moving Average (SMAV)\"** of a financial instrument $i$ (e.g., a stock, commodity, fx-rate) is defined as the mean of the previous $n$ prices, formally denoted by:\n#\n\n# $$SMA_{i}(t)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$$\n\n# were $t$ denotes the current point in time and $n$ the lookback.\n\n# We can calculate the SMAV by just using the Pandas `rolling()` and `mean()`function:\n\n\ncross_mav_15 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_15).mean(), name='SMAV_15')\ncross_mav_60 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_60).mean(), name='SMAV_60')\ncross_mav_200 = pd.Series(ibm_data['Adj Close'].rolling(\n window=cross_mav_days_200).mean(), name='SMAV_200')\n\n\n# Merge the rolling moving average values with the original market data (adjusted closing prices):\n\n\ncross_mav_ibm_data = ibm_data.join(cross_mav_15)\ncross_mav_ibm_data = cross_mav_ibm_data.join(cross_mav_60)\ncross_mav_ibm_data = cross_mav_ibm_data.join(cross_mav_200)\n\n\n# Inspect and validate the daily adjusted closing prices of the IBM stock as well as the derived moving average values starting from the first obtained 200-day moving average market price:\n\n\ncross_mav_ibm_data[['Adj Close', 'SMAV_15',\n 'SMAV_60', 'SMAV_200']].iloc[200:210]\n\n\n# Plot the historical daily adjusted closing prices of the IBM stock (blue) as well as its 15 days (green), 60 days (red) as well as 200 days (yellow) rolling moving averages:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot adjusted closing prices and moving averages\nax.plot(cross_mav_ibm_data['Adj Close'], lw=1.0,\n color='#9b59b6', label='Closing Prices (purple)')\nax.plot(cross_mav_ibm_data['SMAV_15'], color='C1',\n lw=1.0, label='15-day MAV (green)')\nax.plot(cross_mav_ibm_data['SMAV_60'], color='C1',\n lw=1.0, label='60-day MAV (red)')\nax.plot(cross_mav_ibm_data['SMAV_200'], color='C4',\n lw=1.0, label='200-day MAV (yellow)')\n\n# rotate x-tick labels\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines Corporation (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# ### 4. Moving Average Crossover Signal Generation\n\n# Derive trading signals from of two distinct moving average crossover trading strategy configurations. We will generate a **long-signal** (+1.0) for the time intervals where the fast moving averages are above the 200-day moving average. 
In addition we generate a **short-signal** (-1.0) for the time intervals where the fast moving averages are below the 200-day moving average:\n\n\n# create 'fast' trend-following signals\ncross_mav_ibm_data['SIGNAL_15'] = 0.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_15'] >\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_15'] = 1.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_15'] <\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_15'] = -1.0\n\n# create 'slow' trend-following signals\ncross_mav_ibm_data['SIGNAL_60'] = 0.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_60'] >\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_60'] = 1.0\ncross_mav_ibm_data.loc[cross_mav_ibm_data['SMAV_60'] <\n cross_mav_ibm_data['SMAV_200'], 'SIGNAL_60'] = -1.0\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n\ncross_mav_ibm_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the long-/short-term moving-average trading strategy:\n\n\n# convert signals to Pandas DataFrame\ncross_mav_ibm_signal_data = pd.DataFrame(cross_mav_ibm_data[[\n 'SIGNAL_15', 'SIGNAL_60', 'SIGNAL_BASE']], columns=['SIGNAL_15', 'SIGNAL_60', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\ncross_mav_ibm_signal_data = cross_mav_ibm_signal_data.set_index(\n pd.to_datetime(ibm_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n\ncross_mav_ibm_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the 15-days and 60-days crossover moving average trading strategies:\n\n\ncross_mav_ibm_signal_data[cross_mav_ibm_signal_data['SIGNAL_15']\n != cross_mav_ibm_signal_data['SIGNAL_60']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\nax[0].plot(cross_mav_ibm_signal_data['SIGNAL_15'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(cross_mav_ibm_signal_data['SIGNAL_60'],\n lw=1.0, color='C1', label='SMAV 60 (green)')\nax[2].plot(cross_mav_ibm_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[smav 15 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[smav 60 signal]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'International Business Machines Corporation (IBM) - 15 days Crossover Moving Average Trading Signals', fontsize=10)\nax[1].set_title(\n 'International Business Machines Corporation (IBM) - 60 days Crossover Moving Average Trading Signals', fontsize=10)\nax[2].set_title(\n 'International Business Machines Corporation (IBM) - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n\n# signal changes of the 15-200 days moving average crossover trading strategy\nlen(list(it.groupby(cross_mav_ibm_signal_data['SIGNAL_15'], 
lambda x: x > 0)))\n\n\n# signal changes of the 60-200 days moving average crossover trading strategy\nlen(list(it.groupby(cross_mav_ibm_signal_data['SIGNAL_60'], lambda x: x > 0)))\n\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n cross_mav_ibm_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. Moving Average Crossover Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the crossover moving average trading strategy configurations:\n\n\n# extract the ibm stock closing prices\nibm_market_data = pd.DataFrame(ibm_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'ibm' (since this is the column we want to allocate to in the backtest)\nibm_market_data = ibm_market_data.rename(columns={'Adj Close': 'IBM'})\n\n# convert pandas DataFrame index to datatype: datetime\nibm_market_data = ibm_market_data.set_index(pd.to_datetime(ibm_data.index))\n\n\n\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "intent": "# Inspect top 10 rows"}, {"original_comment": "# Select two rows and two columns:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Focusing only on [], .loc, and .iloc\n# #### There are many ways to select subsets of data, but in this article we will only cover the usage of the square brackets ([]), .loc and .iloc. Collectively, they are called the indexers. These are by far the most common ways to select data. A different part of this Series will discuss a few methods that can be used to make subset selections.\n#\n# #### If you have a DataFrame, df, your subset selection will look something like the following:\n#\n# ### df[ ]\n# ### df.loc[ ]\n# ### df.iloc[ ]\n# #### A real subset selection will have something inside of the square brackets. All selections in this article will take place inside of those square brackets.\n#\n# #### Notice that the square brackets also follow .loc and .iloc. All indexing in Python happens inside of these square brackets.\n\n#%%\n\nimport pandas as pd\ndf = pd.read_csv('movie_sample_data.csv', index_col=0)\ndf\n\n#%%\n\nindex = df.index\ncolumns = df.columns\nvalues = df.values\n\n#%%\n\nindex\n\n#%%\n\ncolumns\n\n#%%\n\nvalues\n\n#%%\n\n# Selecting multiple columns with just the indexing operator\ndf[['state', 'age', 'height', 'score']]\n\n\n# ## Select a single row as a Series with .loc\n\n#%%\n\n# The .loc indexer will return a single row as a Series when given a single row label.\n# Let's select the row for Niko.\ndf.loc['Niko']\n\n\n# ## Select multiple rows as a DataFrame with .loc\n#\n# To select multiple rows, put all the row labels you want to select in a list and pass that to .loc.\n# Let's select Niko and Penelope.\n#\n#\n\n#%%\n\ndf.loc[['Niko', 'Penelope']]\n\n\n# ## Use slice notation to select a range of rows with .loc\n# It is possible to 'slice' the rows of a DataFrame with .loc by using slice notation. 
Slice notation uses a colon to separate start, stop and step values.\n# For instance we can select all the rows from Niko through Dean like this:\n\n#%%\n\ndf.loc['Niko':'Dean']\n# .loc includes the last value with slice notation\n# Notice that the row labeled with Dean was kept.\n# In other data containers such as Python lists, the last value is excluded.\n\n\n# ## Other slices\n# You can use slice notation similarly to how you use it with lists.\n# Let's slice from the beginning through Aaron:\n\n#%%\n\ndf.loc[:'Aaron']\n\n#%%\n\n# Slice from Niko to Christina stepping by 2:\ndf.loc['Niko':'Christina':2]\n\n#%%\n\ndf.loc['Dean':]\n\n\n# ## Selecting rows and columns simultaneously with .loc\n# Unlike just the indexing operator, it is possible to select rows and columns simultaneously with .loc. You do it by separating your row and column selections by a comma. It will look something like this:\n#\n# df.loc[row_selection, column_selection]\n#\n# ## Select two rows and three columns\n# For instance, if we wanted to select the rows Dean and Cornelia along with the columns age, state and score we would do this:\n\n#%%\n\ndf.loc[['Jane', 'Dean'], ['age', 'height', 'score']]\n\n\n# ## Use any combination of selections for either row or columns for .loc\n# Row or column selections can be any of the following as we have already seen:\n#\n# A single label\n# A list of labels\n# A slice with labels\n# We can use any of these three for either row or column selections with .loc. Let's see some examples.\n#\n# Let's select two rows and a single column:\n\n#%%\n\ndf.loc[['Jane', 'Aaron'], 'food']\n\n#%%\n\n# Select a slice of rows and a list of columns:\ndf.loc['Jane':'Dean', ['food', 'age']]\n\n#%%\n\n# Select a single row and a single column. This returns a scalar value.\ndf.loc['Jane', 'age']\n\n#%%\n\n# Select a slice of rows and columns\ndf.loc[:'Dean', 'age':]\n\n\n# ## Selecting all of the rows and some columns\n# It is possible to select all of the rows by using a single colon.\n# You can then select columns as normal:\n\n#%%\n\ndf.loc[:, ['food', 'color']]\n# df[['food','color']]\n# we can do it by both the ways\n\n#%%\n\n# You can also use this notation to select all of the columns:\ndf.loc['Jane':'Penelope', :]\n\n#%%\n\ndf.loc[['Jane', 'Aaron'], :]\n\n#%%\n\n# But, it isn't necessary as we have seen, so you can leave out that last colon:\ndf.loc[['Jane', 'Aaron']]\n\n\n# ## Assign row and column selections to variables\n# It might be easier to assign row and column selections to variables before you use .loc.\n# This is useful if you are selecting many rows or columns:\n\n#%%\n\nrows = ['Jane', 'Niko', 'Dean', 'Cornelia']\ncols = ['color', 'food', 'age', 'height']\n\n#%%\n\ndf.loc[rows, cols]\n\n\n# ## Summary of .loc\n# Only uses labels\n# Can select rows and columns simultaneously\n# Selection can be a single label, a list of labels or a slice of labels\n# Put a comma between row and column selections\n#\n# # Getting started with .iloc\n# The .iloc indexer is very similar to .loc but only uses integer locations to make its selections.\n# The word .iloc itself stands for integer location so that should help with remember what it does.\n#\n# ## Selecting a single row with .iloc\n# By passing a single integer to .iloc, it will select one row as a Series:\n\n#%%\n\ndf.iloc[0] # it's giving the first tuple\n\n\n# ## Selecting multiple rows with .iloc\n# Use a list of integers to select multiple rows:\n\n#%%\n\ndf.iloc[[5, 2, 4]] # df.iloc[5, 2, 4] Error!\n\n\n# ## Use slice notation to select 
a range of rows with .iloc\n# Slice notation works just like a list in this instance and is exclusive of the last element\n\n#%%\n\ndf.iloc[3:5]\n\n\n# ## Selecting rows and columns simultaneously with .iloc\n# Just like with .iloc any combination of a single integer, lists of integers or slices can be used\n# to select rows and columns simultaneously. Just remember to separate the selections with a comma.\n#", "target_code": "df.iloc[[2, 4], [2, 4]]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Focusing only on [], .loc, and .iloc\n# #### There are many ways to select subsets of data, but in this article we will only cover the usage of the square brackets ([]), .loc and .iloc. Collectively, they are called the indexers. These are by far the most common ways to select data. A different part of this Series will discuss a few methods that can be used to make subset selections.\n#\n# #### If you have a DataFrame, df, your subset selection will look something like the following:\n#\n# ### df[ ]\n# ### df.loc[ ]\n# ### df.iloc[ ]\n# #### A real subset selection will have something inside of the square brackets. All selections in this article will take place inside of those square brackets.\n#\n# #### Notice that the square brackets also follow .loc and .iloc. All indexing in Python happens inside of these square brackets.\n\n\nimport pandas as pd\ndf = pd.read_csv('movie_sample_data.csv', index_col=0)\ndf\n\n\nindex = df.index\ncolumns = df.columns\nvalues = df.values\n\n\nindex\n\n\ncolumns\n\n\nvalues\n\n\n# Selecting multiple columns with just the indexing operator\ndf[['state', 'age', 'height', 'score']]\n\n\n# ## Select a single row as a Series with .loc\n\n\n# The .loc indexer will return a single row as a Series when given a single row label.\n# Let's select the row for Niko.\ndf.loc['Niko']\n\n\n# ## Select multiple rows as a DataFrame with .loc\n#\n# To select multiple rows, put all the row labels you want to select in a list and pass that to .loc.\n# Let's select Niko and Penelope.\n#\n#\n\n\ndf.loc[['Niko', 'Penelope']]\n\n\n# ## Use slice notation to select a range of rows with .loc\n# It is possible to 'slice' the rows of a DataFrame with .loc by using slice notation. Slice notation uses a colon to separate start, stop and step values.\n# For instance we can select all the rows from Niko through Dean like this:\n\n\ndf.loc['Niko':'Dean']\n# .loc includes the last value with slice notation\n# Notice that the row labeled with Dean was kept.\n# In other data containers such as Python lists, the last value is excluded.\n\n\n# ## Other slices\n# You can use slice notation similarly to how you use it with lists.\n# Let's slice from the beginning through Aaron:\n\n\ndf.loc[:'Aaron']\n\n\n# Slice from Niko to Christina stepping by 2:\ndf.loc['Niko':'Christina':2]\n\n\ndf.loc['Dean':]\n\n\n# ## Selecting rows and columns simultaneously with .loc\n# Unlike just the indexing operator, it is possible to select rows and columns simultaneously with .loc. You do it by separating your row and column selections by a comma. 
It will look something like this:\n#\n# df.loc[row_selection, column_selection]\n#\n# ## Select two rows and three columns\n# For instance, if we wanted to select the rows Dean and Cornelia along with the columns age, state and score we would do this:\n\n\ndf.loc[['Jane', 'Dean'], ['age', 'height', 'score']]\n\n\n# ## Use any combination of selections for either row or columns for .loc\n# Row or column selections can be any of the following as we have already seen:\n#\n# A single label\n# A list of labels\n# A slice with labels\n# We can use any of these three for either row or column selections with .loc. Let's see some examples.\n#\n# Let's select two rows and a single column:\n\n\ndf.loc[['Jane', 'Aaron'], 'food']\n\n\n# Select a slice of rows and a list of columns:\ndf.loc['Jane':'Dean', ['food', 'age']]\n\n\n# Select a single row and a single column. This returns a scalar value.\ndf.loc['Jane', 'age']\n\n\n# Select a slice of rows and columns\ndf.loc[:'Dean', 'age':]\n\n\n# ## Selecting all of the rows and some columns\n# It is possible to select all of the rows by using a single colon.\n# You can then select columns as normal:\n\n\ndf.loc[:, ['food', 'color']]\n# df[['food','color']]\n# we can do it by both the ways\n\n\n# You can also use this notation to select all of the columns:\ndf.loc['Jane':'Penelope', :]\n\n\ndf.loc[['Jane', 'Aaron'], :]\n\n\n# But, it isn't necessary as we have seen, so you can leave out that last colon:\ndf.loc[['Jane', 'Aaron']]\n\n\n# ## Assign row and column selections to variables\n# It might be easier to assign row and column selections to variables before you use .loc.\n# This is useful if you are selecting many rows or columns:\n\n\nrows = ['Jane', 'Niko', 'Dean', 'Cornelia']\ncols = ['color', 'food', 'age', 'height']\n\n\ndf.loc[rows, cols]\n\n\n# ## Summary of .loc\n# Only uses labels\n# Can select rows and columns simultaneously\n# Selection can be a single label, a list of labels or a slice of labels\n# Put a comma between row and column selections\n#\n# # Getting started with .iloc\n# The .iloc indexer is very similar to .loc but only uses integer locations to make its selections.\n# The word .iloc itself stands for integer location so that should help with remember what it does.\n#\n# ## Selecting a single row with .iloc\n# By passing a single integer to .iloc, it will select one row as a Series:\n\n\ndf.iloc[0] # it's giving the first tuple\n\n\n# ## Selecting multiple rows with .iloc\n# Use a list of integers to select multiple rows:\n\n\ndf.iloc[[5, 2, 4]] # df.iloc[5, 2, 4] Error!\n\n\n# ## Use slice notation to select a range of rows with .iloc\n# Slice notation works just like a list in this instance and is exclusive of the last element\n\n\ndf.iloc[3:5]\n\n\n# ## Selecting rows and columns simultaneously with .iloc\n# Just like with .iloc any combination of a single integer, lists of integers or slices can be used\n# to select rows and columns simultaneously. 
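\n#\n# For example (a hedged illustration; the integer positions below are chosen arbitrarily and are not part of the original article):\n\n\ndf.iloc[[1, 3], [0, 2]]  # rows at integer positions 1 and 3, columns at positions 0 and 2\n# slices work as well, e.g. df.iloc[:3, 2:5] selects the first three rows and columns 2 through 4\n\n\n# 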
Just remember to separate the selections with a comma.\n#\n\n\n\n", "project_metadata": {"full_name": "debaonline4u/Python_Programming", "description": "Repository Contains Projects done in Python", "topics": [], "git_url": "git://github.com/debaonline4u/Python_Programming.git", "stars": 24, "watchers": 24, "forks": 10, "created": "2018-06-23T19:22:38Z", "size": 2832, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 979455, "Python": 24732}, "last_updated": "2021-01-03T16:43:00Z"}, "intent": "# Select two rows and two columns:"}, {"original_comment": "# Automatic gradient calculation\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# Hello Kagglers!! Enjoying the competitions and Kernels? I bet you should be. Well, today I am gonna present you something totally new which is totally awesome. If you remember, inspired by the imperative style of **PyTorch**, **TF** developers introduced the **Eager mode** finally. Even though it is out, but I guess most of you wouldn't have tried it (at least on Kaggle Kernels). Yes, you guessed it correctly!! Today's kernel is to show how can you get PyTorch style and the combined power of TF in Eager mode. **My reaction after using it?**\n# ![LEGENDARY!!](https://media.giphy.com/media/3ohzdIuqJoo8QdKlnW/giphy.gif)\n\n#%%\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in\nfrom subprocess import check_output\nfrom sklearn.model_selection import train_test_split\nfrom keras.utils import to_categorical\nfrom skimage.io import imread, imsave, imshow\nfrom keras.preprocessing import image\nimport os\nimport glob\nfrom pathlib import Path\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport tensorflow as tf\nimport tensorflow.contrib.eager as tfe\ntfe.enable_eager_execution() # enable the eager mode before doing any operation\n\n\nnp.random.seed(111)\ncolor = sns.color_palette()\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n# Any results you write to the current directory are saved as output.\n\n\n# FMNIST is a very cool dataset. MNIST, in my opinion, has become way too old to experiment and validate models. Replace MNIST with FMNIST!!\n\n#%%\n\n# Read the train and test csv first\ntrain = pd.read_csv('../input/fashion-mnist_train.csv')\ntest = pd.read_csv('../input/fashion-mnist_test.csv')\n\nprint(\"Number of training samples: \", len(train))\nprint(\"Number of test samples: \", len(test))\n\n#%%\n\n# Let's look at how the train dataset looks like\ntrain.sample(10)\n\n\n# So, for each sample, there are 785 columns out of which the first column represents the label of the corresponding sample and the other 784 columns are the pixel values of a (28x28) image. Now, let's look at the test dataset too\n\n#%%\n\n# Random samples from test data\ntest.sample(10)\n\n\n# There are a total of **10 categories** in FMNIST dataset. I haven't read the description fully (LOL!!), so I will inspect the number of samples for each category myself. 
Let's do that first.\n\n#%%\n\n# Get the count for each label\nlabel_count = train[\"label\"].value_counts()\n\n# Get total number of samples\ntotal_samples = len(train)\n\n# Make a dictionary for all the labels.\nlabels = {0: \"T-shirt/top\", 1: \"Trouser\", 2: \"Pullover\", 3: \"Dress\", 4: \"Coat\",\n 5: \"Sandal\", 6: \"Shirt\", 7: \"Sneaker\", 8: \"Bag\", 9: \"Ankle Boot\"}\n\nfor i in range(len(label_count)):\n label = labels[label_count.index[i]]\n count = label_count.values[i]\n pct = (count / total_samples) * 100\n print(\"{:<15s}: {} or {}%\".format(label, count, pct))\n\n\n# Have I done something wrong here? Such a balanced dataset, so unrealistic!! Can't believe my eyes as I am seeing such a balanced dataset after such a long time. Let's quickly plot some samples for each category.\n\n#%%\n\n# An empty list to collect some samples\nsample_images = []\n\n# Iterate over the keys of the labels dictionary defined in the above cell\nfor k in labels.keys():\n # Get two samples for each category\n samples = train[train[\"label\"] == k].head(2)\n # Append the samples to the samples list\n for j, s in enumerate(samples.values):\n # First column contain labels, hence index should start from 1\n img = np.array(samples.iloc[j, 1:]).reshape(28, 28)\n sample_images.append(img)\n\nprint(\"Total number of sample images to plot: \", len(sample_images))\n\n#%%\n\n# Plot the sample images now\nf, ax = plt.subplots(5, 4, figsize=(15, 10))\n\nfor i, img in enumerate(sample_images):\n ax[i//4, i % 4].imshow(img, cmap='gray')\n ax[i//4, i % 4].axis('off')\nplt.show()\n\n\n# Ha!! Nice plot. Isn't it?\n#\n# ## Preprocessing of data\n\n#%%\n\n# Separate the labels from train and test dataframe\ntr_labels = train[\"label\"]\nts_labels = test[\"label\"]\n\n# Drop the labels column from train dataframe as well as test dataframe\ntrain = train.drop([\"label\"], axis=1)\ntest = test.drop([\"label\"], axis=1)\n\n# Split the training dataset into training and validation sets\nX_train, X_valid, y_train, y_valid = train_test_split(\n train, tr_labels, test_size=0.2, random_state=111)\nprint(\"Number of samples in the train set: \", len(X_train))\nprint(\"Number of samples in the validation set: \", len(X_valid))\n\n# Just a consistency check\nprint(\"Train and validation shapes: \", end=\" \")\nprint(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)\n\n\n# The pixel values in the dataset has been obtained after flattening the image pixels. There are 784 columns and each image is 28x28 grayscale image. Let's reshape our data properly. 
Also, there are 10 categories that we want to classify, we will use the `to_categorical` method available in keras for converting the labels to OHE\n\n#%%\n\n# Reshape the data values\nX_train = np.array(X_train.iloc[:, :]).reshape(len(X_train), 28, 28, 1)\nX_valid = np.array(X_valid.iloc[:, :]).reshape(len(X_valid), 28, 28, 1)\nX_test = np.array(test.iloc[:, :]).reshape(len(test), 28, 28, 1)\n\n# Some more preprocessing\nX_train = X_train.astype(np.float32)\nX_valid = X_valid.astype(np.float32)\nX_test = X_test.astype(np.float32)\n\ntrain_mean = X_train.mean()\n\n# Mean subtraction from pixels\nX_train -= train_mean\nX_valid -= train_mean\nX_test -= train_mean\n\n# Normalization\nX_train /= 255.\nX_valid /= 255.\nX_test /= 255.\n\n# One Hot Encoding(OHE)\ny_train = to_categorical(y_train, num_classes=10).astype(np.int8)\ny_valid = to_categorical(y_valid, num_classes=10).astype(np.int8)\n\nprint(\"X_train shape: {}, y_train shape: {} \".format(\n X_train.shape, y_train.shape))\nprint(\"X_valid shape: {}, y_valid shape: {} \".format(\n X_valid.shape, y_valid.shape))\nprint(\"X_test shape: \", X_test.shape)\n\n\n# Great. Before moving to building blocks of our architecture, let's define a simple data generator for our model first\n\n#%%\n\n# A simple data generator\ndef data_gen(data, labels, batch_size=8):\n # Get total number of samples in the data\n n = len(data)\n\n # Define two numpy arrays for containing batch data and labels\n batch_data = np.zeros((batch_size, 28, 28, 1), dtype=np.float32)\n batch_labels = np.zeros((batch_size, 10), dtype=np.int8)\n\n # Get a numpy array of all the indices of the input data\n indices = np.arange(n)\n\n # Initialize a counter\n i = 0\n while True:\n np.random.shuffle(indices)\n # Get the next batch\n next_batch = indices[(i*batch_size):(i+1)*batch_size]\n for j, idx in enumerate(next_batch):\n batch_data[j] = data[idx]\n batch_labels[j] = labels[idx]\n\n yield batch_data, batch_labels\n i += 1\n\n\n# ## Eager mode begins!!\n#\n# Wait a second. What is **Eager execution** at all?\n#\n# (From Tensorflow docs)\n#\n# Eager execution is a feature that makes TensorFlow execute operations immediately: concrete values are returned, instead of a computational graph to be executed later.\n#\n# As a result, enabling eager execution provides:\n# * A NumPy-like library for numerical computation with support for GPU acceleration and automatic differentiation.\n# * A flexible platform for machine learning research and experimentation.\n#\n# It gives you an imperative way of defining your models. Now you may ask: **Why on the earth does it matter**?\n# Well, there are a lot of reasons but the most simple one is this:\n#\n# ***I want to write and execute everything as if I am writing a pure Python code, no tf.Session() and other things***\n#\n\n# ### What is the best way to define a model?\n#\n# You can define your model as you want. You could define your model inside a function or you could define it inside a class, it's totally up to you. If you ask me, I like to keep associated things together and this is where I love to write OOP. The following point summarizes how I have defined the model for this notebook.\n#\n# * Define a class FMNIST\n# * The constructor part (__init__): You should define all the layers that you are gonna use in your network here. It is **highly recommended** to use high-level **tf.layers** API for defining your layers\n# * Other methods that like fit/predict, that you want to use for training and inference purposes. 
In the code below, I have defined just the predict function for the sake of simplicity here. You can define fit and predict both separately if you want.\n#\n# * Define your cost function and your metric function(like accuracy, precision, etc)\n# * Instantiate your model\n# * Instantiate your optimizer\n# * Define your gradient calculations (I will explain this later in the notebook)\n# * Train and validate your network\n\n# ### Model\n\n#%%\n\n# Class represnting our model\nclass FMNIST(object):\n def __init__(self, data_format):\n # Set the input shape according to the availability of GPU\n if data_format == 'channels_first':\n self._input_shape = [-1, 1, 28, 28]\n else:\n self._input_shape = [-1, 28, 28, 1]\n\n # Start defining the type of layers that you want in your network\n self.conv1 = tf.layers.Conv2D(32, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n\n self.maxpool = tf.layers.MaxPooling2D((2, 2), (2, 2),\n padding='same',\n data_format=data_format)\n\n self.conv2 = tf.layers.Conv2D(64, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n self.conv3 = tf.layers.Conv2D(128, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n\n self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)\n self.dense2 = tf.layers.Dense(512, activation=tf.nn.relu)\n self.dropout = tf.layers.Dropout(0.5)\n self.dense3 = tf.layers.Dense(10)\n\n # Combine the layers to form the architecture\n\n def predict(self, inputs, drop=False):\n x = tf.reshape(inputs, self._input_shape)\n x = self.conv1(x)\n x = self.maxpool(x)\n x = self.conv2(x)\n x = self.maxpool(x)\n x = self.conv3(x)\n x = self.maxpool(x)\n x = tf.layers.flatten(x)\n x = self.dense1(x)\n # enable at training and disable at testing\n x = self.dropout(x, training=drop)\n x = self.dense2(x)\n x = self.dropout(x, training=drop)\n x = self.dense3(x)\n return x\n\n\n# ### Cost function/loss function\n\n#%%\n\n# There are 10 categories, hence we will be using the cross-entropy loss here\ndef loss(model, inputs, targets, drop=False):\n return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n logits=model.predict(inputs, drop=drop), labels=targets))\n\n\n# ### Metric calculation\n\n#%%\n\n# In our case, accuracy will be the metric that we are going to use for evaluation\ndef compute_accuracy(predictions, labels):\n model_pred = tf.argmax(predictions, axis=1, output_type=tf.int64)\n actual_labels = tf.argmax(labels, axis=1, output_type=tf.int64)\n return tf.reduce_sum(tf.cast(tf.equal(model_pred, actual_labels), dtype=tf.float32)) / float(predictions.shape[0].value)\n\n\n# There are four things that are going on in the next cell.\n# 1. Device selection: If GPU is there, data format should be NCHW as it is more optimized for GPU operations. If only CPU is there, the data format should be NHWC as it works better this way on CPU.\n#\n# 2. Model instantiation\n#\n# 3. Optimizer selection\n#\n# 4. Gradient calculations: Although you can write your own function that calculates the gradient for each trainable variable for backpropagation but as the number of variables grows it can be hard to write one. The good thing is that TF provides implicit automatic differentiation. 
The only thing that you need to do is to pass your loss function name as a parameter to the `tfe.implicit_gradient()` method.\n\n#%%\n\n# Device selection\ndevice = \"gpu:0\" if tfe.num_gpus() else \"cpu:0\"\n\n# Get an instance of your model\nmodel = FMNIST('channels_first' if tfe.num_gpus() else 'channels_last')\n\n# Define an optimizer\noptimizer = tf.train.AdamOptimizer(learning_rate=1e-4)", "target_code": "grad = tfe.implicit_gradients(loss)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# Hello Kagglers!! Enjoying the competitions and Kernels? I bet you should be. Well, today I am gonna present you something totally new which is totally awesome. If you remember, inspired by the imperative style of **PyTorch**, **TF** developers introduced the **Eager mode** finally. Even though it is out, but I guess most of you wouldn't have tried it (at least on Kaggle Kernels). Yes, you guessed it correctly!! Today's kernel is to show how can you get PyTorch style and the combined power of TF in Eager mode. **My reaction after using it?**\n# ![LEGENDARY!!](https://media.giphy.com/media/3ohzdIuqJoo8QdKlnW/giphy.gif)\n\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in\nfrom subprocess import check_output\nfrom sklearn.model_selection import train_test_split\nfrom keras.utils import to_categorical\nfrom skimage.io import imread, imsave, imshow\nfrom keras.preprocessing import image\nimport os\nimport glob\nfrom pathlib import Path\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport tensorflow as tf\nimport tensorflow.contrib.eager as tfe\ntfe.enable_eager_execution() # enable the eager mode before doing any operation\n\n\nnp.random.seed(111)\ncolor = sns.color_palette()\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nprint(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n# Any results you write to the current directory are saved as output.\n\n\n# FMNIST is a very cool dataset. MNIST, in my opinion, has become way too old to experiment and validate models. Replace MNIST with FMNIST!!\n\n\n# Read the train and test csv first\ntrain = pd.read_csv('../input/fashion-mnist_train.csv')\ntest = pd.read_csv('../input/fashion-mnist_test.csv')\n\nprint(\"Number of training samples: \", len(train))\nprint(\"Number of test samples: \", len(test))\n\n\n# Let's look at how the train dataset looks like\ntrain.sample(10)\n\n\n# So, for each sample, there are 785 columns out of which the first column represents the label of the corresponding sample and the other 784 columns are the pixel values of a (28x28) image. Now, let's look at the test dataset too\n\n\n# Random samples from test data\ntest.sample(10)\n\n\n# There are a total of **10 categories** in FMNIST dataset. I haven't read the description fully (LOL!!), so I will inspect the number of samples for each category myself. 
Let's do that first.\n\n\n# Get the count for each label\nlabel_count = train[\"label\"].value_counts()\n\n# Get total number of samples\ntotal_samples = len(train)\n\n# Make a dictionary for all the labels.\nlabels = {0: \"T-shirt/top\", 1: \"Trouser\", 2: \"Pullover\", 3: \"Dress\", 4: \"Coat\",\n 5: \"Sandal\", 6: \"Shirt\", 7: \"Sneaker\", 8: \"Bag\", 9: \"Ankle Boot\"}\n\nfor i in range(len(label_count)):\n label = labels[label_count.index[i]]\n count = label_count.values[i]\n pct = (count / total_samples) * 100\n print(\"{:<15s}: {} or {}%\".format(label, count, pct))\n\n\n# Have I done something wrong here? Such a balanced dataset, so unrealistic!! Can't believe my eyes as I am seeing such a balanced dataset after such a long time. Let's quickly plot some samples for each category.\n\n\n# An empty list to collect some samples\nsample_images = []\n\n# Iterate over the keys of the labels dictionary defined in the above cell\nfor k in labels.keys():\n # Get two samples for each category\n samples = train[train[\"label\"] == k].head(2)\n # Append the samples to the samples list\n for j, s in enumerate(samples.values):\n # First column contain labels, hence index should start from 1\n img = np.array(samples.iloc[j, 1:]).reshape(28, 28)\n sample_images.append(img)\n\nprint(\"Total number of sample images to plot: \", len(sample_images))\n\n\n# Plot the sample images now\nf, ax = plt.subplots(5, 4, figsize=(15, 10))\n\nfor i, img in enumerate(sample_images):\n ax[i//4, i % 4].imshow(img, cmap='gray')\n ax[i//4, i % 4].axis('off')\nplt.show()\n\n\n# Ha!! Nice plot. Isn't it?\n#\n# ## Preprocessing of data\n\n\n# Separate the labels from train and test dataframe\ntr_labels = train[\"label\"]\nts_labels = test[\"label\"]\n\n# Drop the labels column from train dataframe as well as test dataframe\ntrain = train.drop([\"label\"], axis=1)\ntest = test.drop([\"label\"], axis=1)\n\n# Split the training dataset into training and validation sets\nX_train, X_valid, y_train, y_valid = train_test_split(\n train, tr_labels, test_size=0.2, random_state=111)\nprint(\"Number of samples in the train set: \", len(X_train))\nprint(\"Number of samples in the validation set: \", len(X_valid))\n\n# Just a consistency check\nprint(\"Train and validation shapes: \", end=\" \")\nprint(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)\n\n\n# The pixel values in the dataset has been obtained after flattening the image pixels. There are 784 columns and each image is 28x28 grayscale image. Let's reshape our data properly. 
Also, there are 10 categories that we want to classify, we will use the `to_categorical` method available in keras for converting the labels to OHE\n\n\n# Reshape the data values\nX_train = np.array(X_train.iloc[:, :]).reshape(len(X_train), 28, 28, 1)\nX_valid = np.array(X_valid.iloc[:, :]).reshape(len(X_valid), 28, 28, 1)\nX_test = np.array(test.iloc[:, :]).reshape(len(test), 28, 28, 1)\n\n# Some more preprocessing\nX_train = X_train.astype(np.float32)\nX_valid = X_valid.astype(np.float32)\nX_test = X_test.astype(np.float32)\n\ntrain_mean = X_train.mean()\n\n# Mean subtraction from pixels\nX_train -= train_mean\nX_valid -= train_mean\nX_test -= train_mean\n\n# Normalization\nX_train /= 255.\nX_valid /= 255.\nX_test /= 255.\n\n# One Hot Encoding(OHE)\ny_train = to_categorical(y_train, num_classes=10).astype(np.int8)\ny_valid = to_categorical(y_valid, num_classes=10).astype(np.int8)\n\nprint(\"X_train shape: {}, y_train shape: {} \".format(\n X_train.shape, y_train.shape))\nprint(\"X_valid shape: {}, y_valid shape: {} \".format(\n X_valid.shape, y_valid.shape))\nprint(\"X_test shape: \", X_test.shape)\n\n\n# Great. Before moving to building blocks of our architecture, let's define a simple data generator for our model first\n\n\n# A simple data generator\ndef data_gen(data, labels, batch_size=8):\n # Get total number of samples in the data\n n = len(data)\n\n # Define two numpy arrays for containing batch data and labels\n batch_data = np.zeros((batch_size, 28, 28, 1), dtype=np.float32)\n batch_labels = np.zeros((batch_size, 10), dtype=np.int8)\n\n # Get a numpy array of all the indices of the input data\n indices = np.arange(n)\n\n # Initialize a counter\n i = 0\n while True:\n np.random.shuffle(indices)\n # Get the next batch\n next_batch = indices[(i*batch_size):(i+1)*batch_size]\n for j, idx in enumerate(next_batch):\n batch_data[j] = data[idx]\n batch_labels[j] = labels[idx]\n\n yield batch_data, batch_labels\n i += 1\n\n\n# ## Eager mode begins!!\n#\n# Wait a second. What is **Eager execution** at all?\n#\n# (From Tensorflow docs)\n#\n# Eager execution is a feature that makes TensorFlow execute operations immediately: concrete values are returned, instead of a computational graph to be executed later.\n#\n# As a result, enabling eager execution provides:\n# * A NumPy-like library for numerical computation with support for GPU acceleration and automatic differentiation.\n# * A flexible platform for machine learning research and experimentation.\n#\n# It gives you an imperative way of defining your models. Now you may ask: **Why on the earth does it matter**?\n# Well, there are a lot of reasons but the most simple one is this:\n#\n# ***I want to write and execute everything as if I am writing a pure Python code, no tf.Session() and other things***\n#\n\n# ### What is the best way to define a model?\n#\n# You can define your model as you want. You could define your model inside a function or you could define it inside a class, it's totally up to you. If you ask me, I like to keep associated things together and this is where I love to write OOP. The following point summarizes how I have defined the model for this notebook.\n#\n# * Define a class FMNIST\n# * The constructor part (__init__): You should define all the layers that you are gonna use in your network here. It is **highly recommended** to use high-level **tf.layers** API for defining your layers\n# * Other methods that like fit/predict, that you want to use for training and inference purposes. 
In the code below, I have defined just the predict function for the sake of simplicity here. You can define fit and predict both separately if you want.\n#\n# * Define your cost function and your metric function(like accuracy, precision, etc)\n# * Instantiate your model\n# * Instantiate your optimizer\n# * Define your gradient calculations (I will explain this later in the notebook)\n# * Train and validate your network\n\n# ### Model\n\n\n# Class represnting our model\nclass FMNIST(object):\n def __init__(self, data_format):\n # Set the input shape according to the availability of GPU\n if data_format == 'channels_first':\n self._input_shape = [-1, 1, 28, 28]\n else:\n self._input_shape = [-1, 28, 28, 1]\n\n # Start defining the type of layers that you want in your network\n self.conv1 = tf.layers.Conv2D(32, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n\n self.maxpool = tf.layers.MaxPooling2D((2, 2), (2, 2),\n padding='same',\n data_format=data_format)\n\n self.conv2 = tf.layers.Conv2D(64, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n self.conv3 = tf.layers.Conv2D(128, 3,\n activation=tf.nn.relu,\n padding='same',\n data_format=data_format)\n\n self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)\n self.dense2 = tf.layers.Dense(512, activation=tf.nn.relu)\n self.dropout = tf.layers.Dropout(0.5)\n self.dense3 = tf.layers.Dense(10)\n\n # Combine the layers to form the architecture\n\n def predict(self, inputs, drop=False):\n x = tf.reshape(inputs, self._input_shape)\n x = self.conv1(x)\n x = self.maxpool(x)\n x = self.conv2(x)\n x = self.maxpool(x)\n x = self.conv3(x)\n x = self.maxpool(x)\n x = tf.layers.flatten(x)\n x = self.dense1(x)\n # enable at training and disable at testing\n x = self.dropout(x, training=drop)\n x = self.dense2(x)\n x = self.dropout(x, training=drop)\n x = self.dense3(x)\n return x\n\n\n# ### Cost function/loss function\n\n\n# There are 10 categories, hence we will be using the cross-entropy loss here\ndef loss(model, inputs, targets, drop=False):\n return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n logits=model.predict(inputs, drop=drop), labels=targets))\n\n\n# ### Metric calculation\n\n\n# In our case, accuracy will be the metric that we are going to use for evaluation\ndef compute_accuracy(predictions, labels):\n model_pred = tf.argmax(predictions, axis=1, output_type=tf.int64)\n actual_labels = tf.argmax(labels, axis=1, output_type=tf.int64)\n return tf.reduce_sum(tf.cast(tf.equal(model_pred, actual_labels), dtype=tf.float32)) / float(predictions.shape[0].value)\n\n\n# There are four things that are going on in the next cell.\n# 1. Device selection: If GPU is there, data format should be NCHW as it is more optimized for GPU operations. If only CPU is there, the data format should be NHWC as it works better this way on CPU.\n#\n# 2. Model instantiation\n#\n# 3. Optimizer selection\n#\n# 4. Gradient calculations: Although you can write your own function that calculates the gradient for each trainable variable for backpropagation but as the number of variables grows it can be hard to write one. The good thing is that TF provides implicit automatic differentiation. 
The only thing that you need to do is to pass your loss function name as a parameter to the `tfe.implicit_gradient()` method.\n\n\n# Device selection\ndevice = \"gpu:0\" if tfe.num_gpus() else \"cpu:0\"\n\n# Get an instance of your model\nmodel = FMNIST('channels_first' if tfe.num_gpus() else 'channels_last')\n\n# Define an optimizer\noptimizer = tf.train.AdamOptimizer(learning_rate=1e-4)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# calculate automatic gradient"}, {"original_comment": "# Use matplotlib to draw the points onto the map.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Observation from the Open City Nature Challange\n#\n# RQ1\n# Can we identify macroscopic laws of citizen science projects so that long term dynamics is different?\n#\n# RQ2\n# How communities are growing in time, what drives their growth if not preferential attachment or exogeneous factors?\n#\n# ## Data\n# Muki downloaded all the CNC observations for the bounding box of Europe (metadata below) that includes the geographical locations - it's 62K observations.\n#\n# Query quality_grade=any&identifications=any&swlat=35.327868&swlng=-15.438348&nelat=61.352386&nelng=32.898351&projects[]=city-nature-challenge-2019\n# Columns id, observed_on_string, observed_on, time_observed_at, time_zone, out_of_range, user_id, user_login, created_at, updated_at, quality_grade, license, url, image_url, sound_url, tag_list, description, id_please, num_identification_agreements, num_identification_disagreements, captive_cultivated, oauth_application_id, place_guess, latitude, longitude, positional_accuracy, geoprivacy, taxon_geoprivacy, coordinates_obscured, positioning_method, positioning_device, species_guess, scientific_name, common_name, iconic_taxon_name, taxon_id, taxon_kingdom_name, taxon_phylum_name, taxon_subphylum_name, taxon_superclass_name, taxon_class_name, taxon_subclass_name, taxon_superorder_name, taxon_order_name, taxon_suborder_name, taxon_superfamily_name, taxon_family_name, taxon_subfamily_name, taxon_supertribe_name, taxon_tribe_name, taxon_subtribe_name, taxon_genus_name, taxon_genushybrid_name, taxon_species_name, taxon_hybrid_name, taxon_subspecies_name, taxon_variety_name, taxon_form_name\n#\n# We will focus in particular on data from UK and in particular on London (51.5074\u00b0 N, 0.1278\u00b0 W).\n# Shape of the data is 62246, 58.\n\n#%%\n\nimport folium\nfrom mpl_toolkits.basemap import Basemap\nimport seaborn\nimport matplotlib.pyplot as plt\nimport math\nimport plotly.plotly as py\nimport plotly.graph_objs as go\nimport pandas as pd\nimport csv\n\n\n# load data on trajectories, it is very heavy\n\ndf_cit_sci = pd.read_csv(\n 'C:/Users/lyubo/Documents/DATA_networks/data_citizen_science/observations-65163_space.csv')\n\nprint(df_cit_sci.shape)\n\n#%%\n\ndf_cit_sci.head(10)\n\n#%%\n\nprint(df_cit_sci.columns)\n\n\n# ### Plot data about participants and findings on a map\n#\n# From 62246 unique records of users we plot their distribution on a map.\n\n#%%\n\n# setup Lambert Conformal basemap.\n# set resolution=None to skip processing of boundary datasets.\n# Create a map on which to draw.\n# Use mercator 
projection, and showing the whole world.\n\nfig, ax = plt.subplots(figsize=(60, 60))\n\n# Berlin Latitude: 52.520008, longitude: 13.404954.\n# NYC 40.730610, and the longitude is -73.935242.\n# We want to plot only Berlin surrounding areas\nm = Basemap(projection='merc', llcrnrlat=49, urcrnrlat=59,\n llcrnrlon=-8.5, urcrnrlon=3, lat_ts=20, resolution='c')\n# Draw coastlines, and the edges of the map. NASA bluemarble\nm.shadedrelief() # m.bluemarble()\nm.drawcoastlines()\nm.drawmapboundary()\n\n# Convert latitude and longitude to x and y coordinates\n# TODO: verify if x and y correspond well to lon, lat\nx, y = m(list(df_cit_sci[\"longitude\"].astype(float)),\n list(df_cit_sci[\"latitude\"].astype(float)))\n\n# print(type(x))", "target_code": "plt.show()\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Observation from the Open City Nature Challange\n#\n# RQ1\n# Can we identify macroscopic laws of citizen science projects so that long term dynamics is different?\n#\n# RQ2\n# How communities are growing in time, what drives their growth if not preferential attachment or exogeneous factors?\n#\n# ## Data\n# Muki downloaded all the CNC observations for the bounding box of Europe (metadata below) that includes the geographical locations - it's 62K observations.\n#\n# Query quality_grade=any&identifications=any&swlat=35.327868&swlng=-15.438348&nelat=61.352386&nelng=32.898351&projects[]=city-nature-challenge-2019\n# Columns id, observed_on_string, observed_on, time_observed_at, time_zone, out_of_range, user_id, user_login, created_at, updated_at, quality_grade, license, url, image_url, sound_url, tag_list, description, id_please, num_identification_agreements, num_identification_disagreements, captive_cultivated, oauth_application_id, place_guess, latitude, longitude, positional_accuracy, geoprivacy, taxon_geoprivacy, coordinates_obscured, positioning_method, positioning_device, species_guess, scientific_name, common_name, iconic_taxon_name, taxon_id, taxon_kingdom_name, taxon_phylum_name, taxon_subphylum_name, taxon_superclass_name, taxon_class_name, taxon_subclass_name, taxon_superorder_name, taxon_order_name, taxon_suborder_name, taxon_superfamily_name, taxon_family_name, taxon_subfamily_name, taxon_supertribe_name, taxon_tribe_name, taxon_subtribe_name, taxon_genus_name, taxon_genushybrid_name, taxon_species_name, taxon_hybrid_name, taxon_subspecies_name, taxon_variety_name, taxon_form_name\n#\n# We will focus in particular on data from UK and in particular on London (51.5074\u00b0 N, 0.1278\u00b0 W).\n# Shape of the data is 62246, 58.\n\n\nimport folium\nfrom mpl_toolkits.basemap import Basemap\nimport seaborn\nimport matplotlib.pyplot as plt\nimport math\nimport plotly.plotly as py\nimport plotly.graph_objs as go\nimport pandas as pd\nimport csv\n\n\n# load data on trajectories, it is very heavy\n\ndf_cit_sci = pd.read_csv(\n 'C:/Users/lyubo/Documents/DATA_networks/data_citizen_science/observations-65163_space.csv')\n\nprint(df_cit_sci.shape)\n\n\ndf_cit_sci.head(10)\n\n\nprint(df_cit_sci.columns)\n\n\n# ### Plot data about participants and findings on a map\n#\n# From 62246 unique records of users we plot their distribution on a map.\n\n\n# setup Lambert Conformal basemap.\n# set resolution=None to skip processing of boundary datasets.\n# Create a map on which to draw.\n# Use mercator projection, and showing the whole world.\n\nfig, ax = plt.subplots(figsize=(60, 60))\n\n# Berlin Latitude: 52.520008, longitude: 13.404954.\n# NYC 40.730610, and the longitude is -73.935242.\n# 
We want to plot only Berlin surrounding areas\nm = Basemap(projection='merc', llcrnrlat=49, urcrnrlat=59,\n llcrnrlon=-8.5, urcrnrlon=3, lat_ts=20, resolution='c')\n# Draw coastlines, and the edges of the map. NASA bluemarble\nm.shadedrelief() # m.bluemarble()\nm.drawcoastlines()\nm.drawmapboundary()\n\n# Convert latitude and longitude to x and y coordinates\n# TODO: verify if x and y correspond well to lon, lat\nx, y = m(list(df_cit_sci[\"longitude\"].astype(float)),\n list(df_cit_sci[\"latitude\"].astype(float)))\n\n# print(type(x))\nm.scatter(x, y, 1, marker='o', color='red')\n# Show the plot.\n", "project_metadata": {"full_name": "Liyubov/citizen_science", "description": "Data analysis for citizen science projects", "topics": [], "git_url": "git://github.com/Liyubov/citizen_science.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2019-10-07T18:08:06Z", "size": 6038, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 20539735, "Python": 4486}, "last_updated": "2020-10-20T22:44:28Z"}, "intent": "# draw the points onto the map."}, {"original_comment": "# Let's see what t-SNE will give us here :\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n\nfrom sklearn.manifold import TSNE\nimport numpy as np # linear algebra\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nfrom subprocess import check_output\n\n\n# # **Air Pollution - Learn the basics**\n# ***\n#\n# **Mhamed Jabri \u2014 02/19/2018**\n#\n# Air pollution means exactly what you think it means : it's when the quality of the air you breathe drops. But how does that happen ? That's the real question. It's induced by the presence of harmful, unwanted substances in air (more precisely, into Earth's atmosphere). Those bad substances are the pollutants and most of the tables in this database focus on those pollutants and give information about their preseance in the air.\n# Also, note that [global warming and air pollution are two distinct phenomena](https://www.quora.com/What-is-the-difference-between-air-pollution-and-global-warming) but they do have a lot in common, mainly : **NEITHER ONE OF THEM IS A HOAX !!**\n#\n# By including BigQuery to its kernels, Kaggle allows its users to explores HUGE databeses / datasets which means endless possibilities and discovers to be made. I've decided to create this notebook for two reasons : The first one is that this subject is highly interesting to everyone I think, at least to me, because I think we should all be concerned about the quality of the air we're breathing and how we are affecting it by our daily activities. The second one is that I couldn't wait to use BigQuery on a Jupyter Notebook and see how it goes because it is indeed very exciting !!\n#\n# So I hope that when you're done with this notebook, you'll have learned a thing or two about Air Pollution.\n#\n# If you like the kernel, please leave me a comment / upvote, I would highly appreciate it :) !\n\n# # Table of contents\n# ***\n#\n# * [What's air pollution / air quality ?](#introduction)\n# * [1. Air pollution in the US in 2016](#counties)\n# * [2. 
Yearly evolution of air quality : Rural vs Urban states](#yearly)\n# * [3. Impact of Temperature and Humidity](#weather)\n# * [4. Worldwide air pollution](#world)\n# * [Conclusion](#conclusion)\n#\n\n# ![](http://upload.wikimedia.org/wikipedia/commons/thumb/1/14/Air_Pollution-Causes%26Effects.svg/1052px-Air_Pollution-Causes%26Effects.svg.png)\n\n# # What's air pollution / air quality ?\n# \n# ***\n#\n# As said earlier, air pollution is due to the presence of some pollutants, it's time to learn more about the main pollutants that are present in our database :\n# * Sulphur dioxide ($SO_2$) : This contaminant is mainly emitted during the combustion of fossil fuels such as crude oil and coal.\n# * Carbon monoxide ($CO$) : This gas consists during incomplete combustion of fuels example : A car engine running in a closed room.\n# * Nitrogen dioxide ($NO_2$) : These contaminants are emitted by traffic, combustion installations and the industries.\n# * Ozone ($O_3$) : Ozone is created through the influence of ultra violet sunlight (UV) on pollutants in the outside air.\n# * Particulate Matter ($PM$) : Particulate matter is the sum of all solid and liquid particles suspended in air. This complex mixture includes both organic and inorganic particles, such as dust, pollen, soot, smoke, and liquid droplets. These particles vary greatly in size, composition, and origin.\n#\n# So how are those pollutants produced in our daily lives ? Well as you may have guessed, The main sources of air pollution are the industries, agriculture and traffic (held responsible for one-third of the greenhouse gas emissions). That being said, us consumers are also responsible of polluting the air through some of our activities such as smoking or heating houses ...\n# There's also the effect of weather (wind, temperature, ultra violet sunlight for the Ozone ...), the interactions of all those things provides the picture above which sums it up nicely actually.\n#\n# ### Air Quality Index\n#\n# In severel tables in this databse, you'll find a column 'aqi' which stands for Air Quality Index. Basically the AQI is the measure of how air is polluted, with respect to some pollutant. That means that for a specific hour in a specific place you'll have different AQIs, one for each pollutant.\n# So one must know what values of AQI mean 'good' and what values mean 'bad', hence the table below.\n#\n# ![](http://www.deq.idaho.gov/media/818444-aqi_496x260.jpg)\n#\n# I feel like this introduction was important to give some context and explain what air pollution is about so that all that's coming would make sense for the reader. Let's code now !\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom google.cloud import bigquery\nfrom bq_helper import BigQueryHelper\n\nbq_assistant = BigQueryHelper(\n \"bigquery-public-data\", \"epa_historical_air_quality\")\npollutants = ['o3', 'co', 'no2', 'so2', 'pm25_frm']\n\n\n# # Air pollution in the US in 2016\n# \n# ***\n#\n# In this first part, we'll try to get a grasp of how polluted was the air all over the US during 2016.\n#\n# To do so, I wanted to extract the average AQI for each pollutant for each county, which translates to a groupby(county) in SQL/pandas. In other words, since every table contains the information about one single pollutant, the use of JOIN was necessary. 
However, needing the information about 5 pollutants meant using 4 JOINs and it didn't respect the timeout limit so I needed another way around.\n# **This is where using Python combined to SQL becomes actually cool.** If you're writing such a query in MySQL for example, you won't have many possibilities to 'tweak' it. Here, since the query is actually a string for Python, I wrote a query that I could modify inside a for loop so that in each steap it gives me the information I want about one specific pollutant and at the end, I would just concatenate the dataframes !\n\n#%%\n\nQUERY2016 = \"\"\"\n SELECT\n pollutant.county_name AS County, AVG(pollutant.aqi) AS AvgAQI_pollutant\n FROM\n `bigquery-public-data.epa_historical_air_quality.pollutant_daily_summary` as pollutant\n WHERE\n pollutant.poc = 1\n AND EXTRACT(YEAR FROM pollutant.date_local) = 2016\n GROUP BY \n pollutant.county_name\n\"\"\"\n\ndf_2016 = None\nfor elem_g in pollutants:\n query = QUERY2016.replace(\"pollutant\", elem_g)\n temp = bq_assistant.query_to_pandas(query).set_index('County')\n df_2016 = pd.concat([df_2016, temp], axis=1, join='outer')\ndf_2016 = df_2016.apply(lambda x: x.fillna(x.mean()), axis=0)\n\ndf_2016.sample(10, random_state=42)\n\n\n# Okay so now we have the 2016 measures for every county in the US, what can we do about it to retrieve some useful insights ? Well, **clustering** is a natural answer in the context !", "target_code": "from sklearn.manifold import TSNE\n\nX_tsne = TSNE(n_components=2, n_iter=2000, perplexity=35,\n random_state=5).fit_transform(df_2016)\ndf_tsne = pd.DataFrame(X_tsne)\ndf_tsne['County'] = list(df_2016.index)\ndf_tsne = df_tsne.set_index('County')\ndf_tsne.columns = ['ax1', 'ax2']\n\ndf_tsne.plot(kind='scatter', x='ax1', y='ax2', figsize=(10, 8))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n\nimport numpy as np # linear algebra\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nfrom subprocess import check_output\n\n\n# # **Air Pollution - Learn the basics**\n# ***\n#\n# **Mhamed Jabri \u2014 02/19/2018**\n#\n# Air pollution means exactly what you think it means : it's when the quality of the air you breathe drops. But how does that happen ? That's the real question. It's induced by the presence of harmful, unwanted substances in air (more precisely, into Earth's atmosphere). Those bad substances are the pollutants and most of the tables in this database focus on those pollutants and give information about their preseance in the air.\n# Also, note that [global warming and air pollution are two distinct phenomena](https://www.quora.com/What-is-the-difference-between-air-pollution-and-global-warming) but they do have a lot in common, mainly : **NEITHER ONE OF THEM IS A HOAX !!**\n#\n# By including BigQuery to its kernels, Kaggle allows its users to explores HUGE databeses / datasets which means endless possibilities and discovers to be made. 
I've decided to create this notebook for two reasons : The first one is that this subject is highly interesting to everyone I think, at least to me, because I think we should all be concerned about the quality of the air we're breathing and how we are affecting it by our daily activities. The second one is that I couldn't wait to use BigQuery on a Jupyter Notebook and see how it goes because it is indeed very exciting !!\n#\n# So I hope that when you're done with this notebook, you'll have learned a thing or two about Air Pollution.\n#\n# If you like the kernel, please leave me a comment / upvote, I would highly appreciate it :) !\n\n# # Table of contents\n# ***\n#\n# * [What's air pollution / air quality ?](#introduction)\n# * [1. Air pollution in the US in 2016](#counties)\n# * [2. Yearly evolution of air quality : Rural vs Urban states](#yearly)\n# * [3. Impact of Temperature and Humidity](#weather)\n# * [4. Worldwide air pollution](#world)\n# * [Conclusion](#conclusion)\n#\n\n# ![](http://upload.wikimedia.org/wikipedia/commons/thumb/1/14/Air_Pollution-Causes%26Effects.svg/1052px-Air_Pollution-Causes%26Effects.svg.png)\n\n# # What's air pollution / air quality ?\n# \n# ***\n#\n# As said earlier, air pollution is due to the presence of some pollutants, it's time to learn more about the main pollutants that are present in our database :\n# * Sulphur dioxide ($SO_2$) : This contaminant is mainly emitted during the combustion of fossil fuels such as crude oil and coal.\n# * Carbon monoxide ($CO$) : This gas consists during incomplete combustion of fuels example : A car engine running in a closed room.\n# * Nitrogen dioxide ($NO_2$) : These contaminants are emitted by traffic, combustion installations and the industries.\n# * Ozone ($O_3$) : Ozone is created through the influence of ultra violet sunlight (UV) on pollutants in the outside air.\n# * Particulate Matter ($PM$) : Particulate matter is the sum of all solid and liquid particles suspended in air. This complex mixture includes both organic and inorganic particles, such as dust, pollen, soot, smoke, and liquid droplets. These particles vary greatly in size, composition, and origin.\n#\n# So how are those pollutants produced in our daily lives ? Well as you may have guessed, The main sources of air pollution are the industries, agriculture and traffic (held responsible for one-third of the greenhouse gas emissions). That being said, us consumers are also responsible of polluting the air through some of our activities such as smoking or heating houses ...\n# There's also the effect of weather (wind, temperature, ultra violet sunlight for the Ozone ...), the interactions of all those things provides the picture above which sums it up nicely actually.\n#\n# ### Air Quality Index\n#\n# In severel tables in this databse, you'll find a column 'aqi' which stands for Air Quality Index. Basically the AQI is the measure of how air is polluted, with respect to some pollutant. That means that for a specific hour in a specific place you'll have different AQIs, one for each pollutant.\n# So one must know what values of AQI mean 'good' and what values mean 'bad', hence the table below.\n#\n# ![](http://www.deq.idaho.gov/media/818444-aqi_496x260.jpg)\n#\n# I feel like this introduction was important to give some context and explain what air pollution is about so that all that's coming would make sense for the reader. 
Let's code now !\n\n\nimport pandas as pd\nimport numpy as np\nfrom google.cloud import bigquery\nfrom bq_helper import BigQueryHelper\n\nbq_assistant = BigQueryHelper(\n \"bigquery-public-data\", \"epa_historical_air_quality\")\npollutants = ['o3', 'co', 'no2', 'so2', 'pm25_frm']\n\n\n# # Air pollution in the US in 2016\n# \n# ***\n#\n# In this first part, we'll try to get a grasp of how polluted was the air all over the US during 2016.\n#\n# To do so, I wanted to extract the average AQI for each pollutant for each county, which translates to a groupby(county) in SQL/pandas. In other words, since every table contains the information about one single pollutant, the use of JOIN was necessary. However, needing the information about 5 pollutants meant using 4 JOINs and it didn't respect the timeout limit so I needed another way around.\n# **This is where using Python combined to SQL becomes actually cool.** If you're writing such a query in MySQL for example, you won't have many possibilities to 'tweak' it. Here, since the query is actually a string for Python, I wrote a query that I could modify inside a for loop so that in each steap it gives me the information I want about one specific pollutant and at the end, I would just concatenate the dataframes !\n\n\nQUERY2016 = \"\"\"\n SELECT\n pollutant.county_name AS County, AVG(pollutant.aqi) AS AvgAQI_pollutant\n FROM\n `bigquery-public-data.epa_historical_air_quality.pollutant_daily_summary` as pollutant\n WHERE\n pollutant.poc = 1\n AND EXTRACT(YEAR FROM pollutant.date_local) = 2016\n GROUP BY \n pollutant.county_name\n\"\"\"\n\ndf_2016 = None\nfor elem_g in pollutants:\n query = QUERY2016.replace(\"pollutant\", elem_g)\n temp = bq_assistant.query_to_pandas(query).set_index('County')\n df_2016 = pd.concat([df_2016, temp], axis=1, join='outer')\ndf_2016 = df_2016.apply(lambda x: x.fillna(x.mean()), axis=0)\n\ndf_2016.sample(10, random_state=42)\n\n\n# Okay so now we have the 2016 measures for every county in the US, what can we do about it to retrieve some useful insights ? Well, **clustering** is a natural answer in the context !\n\n\n\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "intent": "# see what t-SNE will give us"}, {"original_comment": " # Append medal_df to medals\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Sampling\n#\n# You can get a randomly rows of the dataset. 
It is very usefull in training machine learning models.\n# We will use the dataset about movie reviewers obtained of [here](http://grouplens.org/datasets/movielens/100k/).\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n#%%\n\n# read a dataset of movie reviewers into a DataFrame\nuser_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\nusers = pd.read_csv('./dataset/u.user', sep='|', header=None,\n names=user_cols, index_col='user_id')\nusers.head()\n\n#%%\n\n# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)\nusers.sample(n=3)\n\n#%%\n\n# use the 'random_state' parameter for reproducibility\nusers.sample(n=3, random_state=42)\n\n#%%\n\n# sample 75% of the DataFrame's rows without replacement\ntrain = users.sample(frac=0.75, random_state=99)\n\n#%%\n\n# store the remaining 25% of the rows in another DataFrame\ntest = users.loc[~users.index.isin(train.index), :]\n\n#%%\n\ntrain.head()\n\n#%%\n\ntest.head()\n\n#%%\n\n# detect duplicate zip codes: True if an item is identical to a previous item\nusers.zip_code.duplicated().tail()\n\n#%%\n\n# count the duplicate items (True becomes 1, False becomes 0)\nusers.zip_code.duplicated().sum()\n\n#%%\n\n# detect duplicate DataFrame rows: True if an entire row is identical to a previous row\nusers.duplicated().tail()\n\n\n# ### Logic for duplicated:\n#\n# + keep='first' (default): Mark duplicates as True except for the first occurrence.\n# + keep='last': Mark duplicates as True except for the last occurrence.\n# + keep=False: Mark all duplicates as True.\n\n#%%\n\n# examine the duplicate rows (ignoring the first occurrence)\nusers.loc[users.duplicated(keep='first'), :]\n\n#%%\n\n# examine the duplicate rows (ignoring the last occurrence)\nusers.loc[users.duplicated(keep='last'), :]\n\n#%%\n\n# examine the duplicate rows (including all duplicates)\nusers.loc[users.duplicated(keep=False), :]\n\n#%%\n\n# only consider a subset of columns when identifying duplicates\nusers.duplicated(subset=['age', 'zip_code']).sum()\n\n#%%\n\n# drop the duplicate rows (inplace=False by default)\nusers.drop_duplicates(keep='first').shape\n\n#%%\n\nusers.drop_duplicates(keep='last').shape\n\n#%%\n\nusers.drop_duplicates(keep=False).shape\n\n\n# ## Appending pandas Series\n\n#%%\n\n# Load 'sales-jan-2015.csv' into a DataFrame: jan\njan = pd.read_csv('./dataset/sales-jan-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Load 'sales-feb-2015.csv' into a DataFrame: feb\nfeb = pd.read_csv('./dataset/sales-feb-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Load 'sales-mar-2015.csv' into a DataFrame: mar\nmar = pd.read_csv('./dataset/sales-mar-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Extract the 'Units' column from jan: jan_units\njan_units = pd.DataFrame(jan['Units'])\n\n# Extract the 'Units' column from feb: feb_units\nfeb_units = pd.DataFrame(feb['Units'])\n\n# Extract the 'Units' column from mar: mar_units\nmar_units = pd.DataFrame(mar['Units'])\n\n# Append feb_units and then mar_units to jan_units: quarter1\nquarter1 = jan_units.append(feb_units).append(mar_units)\n\n# Print the first slice from quarter1\nprint(quarter1.loc['jan 27, 2015':'feb 2, 2015'])\n\n# Print the second slice from quarter1\nprint(quarter1.loc['feb 26, 2015':'mar 7, 2015'])\n\n# Compute & print total sales in quarter1\nprint(quarter1.sum())\n\n#%%\n\ndf_quarter = pd.DataFrame(quarter1, 
columns=['Units'])\n\n#%%\n\ndf_quarter\n\n#%%\n\njan_units.reset_index(inplace=True)\nfeb_units.reset_index(inplace=True)\nmar_units.reset_index(inplace=True)\nquarter_columns = pd.concat(\n [jan_units, feb_units, mar_units], axis=1, ignore_index=False)\n\n#%%\n\ndf_quarter_columns = pd.DataFrame(quarter_columns)\n\n#%%\n\ndf_quarter_columns\n\n\n# ## Reading multiple files to build a DataFrame\n#\n# It is often convenient to build a large DataFrame by parsing many files as DataFrames and concatenating them all at once. You'll do this here with three files, but, in principle, this approach can be used to combine data from dozens or hundreds of files.\n#\n# Here, you'll work with DataFrames compiled from The Guardian's Olympic medal dataset.\n\n#%%\n\nmedals = []\nmedal_types = ['gold', 'silver', 'bronze']\nfor medal in medal_types:\n\n # Create the file name: file_name\n file_name = \"./dataset/olympic-medals/%s_top5.csv\" % medal\n\n # Create list of column names: columns\n columns = ['Country', medal]\n\n # Read file_name into a DataFrame: df\n medal_df = pd.read_csv(file_name, header=0,\n index_col='Country', names=columns)\n\n # Append medal_df to medals\n medals.append(medal_df)\n\n# Concatenate medals horizontally: medals\nmedals = pd.concat(medals, axis='columns', sort=True)\n\n# Print medals\npd.DataFrame(medals)\n\n\n# ## Concatenating vertically to get MultiIndexed rows\n#\n# When stacking a sequence of DataFrames vertically, it is sometimes desirable to construct a MultiIndex to indicate the DataFrame from which each row originated. This can be done by specifying the keys parameter in the call to pd.concat(), which generates a hierarchical index with the labels from keys as the outermost index label. So you don't have to rename the columns of each DataFrame as you load it. Instead, only the Index column needs to be specified.\n#\n#\n\n#%%\n\nmedals = []\nfor medal in medal_types:\n\n file_name = \"./dataset/olympic-medals/%s_top5.csv\" % medal\n\n # Read file_name into a DataFrame: medal_df\n medal_df = pd.read_csv(file_name, index_col='Country')", "target_code": " medals.append(medal_df)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Sampling\n#\n# You can get a randomly rows of the dataset. 
It is very usefull in training machine learning models.\n# We will use the dataset about movie reviewers obtained of [here](http://grouplens.org/datasets/movielens/100k/).\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# read a dataset of movie reviewers into a DataFrame\nuser_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\nusers = pd.read_csv('./dataset/u.user', sep='|', header=None,\n names=user_cols, index_col='user_id')\nusers.head()\n\n\n# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)\nusers.sample(n=3)\n\n\n# use the 'random_state' parameter for reproducibility\nusers.sample(n=3, random_state=42)\n\n\n# sample 75% of the DataFrame's rows without replacement\ntrain = users.sample(frac=0.75, random_state=99)\n\n\n# store the remaining 25% of the rows in another DataFrame\ntest = users.loc[~users.index.isin(train.index), :]\n\n\ntrain.head()\n\n\ntest.head()\n\n\n# detect duplicate zip codes: True if an item is identical to a previous item\nusers.zip_code.duplicated().tail()\n\n\n# count the duplicate items (True becomes 1, False becomes 0)\nusers.zip_code.duplicated().sum()\n\n\n# detect duplicate DataFrame rows: True if an entire row is identical to a previous row\nusers.duplicated().tail()\n\n\n# ### Logic for duplicated:\n#\n# + keep='first' (default): Mark duplicates as True except for the first occurrence.\n# + keep='last': Mark duplicates as True except for the last occurrence.\n# + keep=False: Mark all duplicates as True.\n\n\n# examine the duplicate rows (ignoring the first occurrence)\nusers.loc[users.duplicated(keep='first'), :]\n\n\n# examine the duplicate rows (ignoring the last occurrence)\nusers.loc[users.duplicated(keep='last'), :]\n\n\n# examine the duplicate rows (including all duplicates)\nusers.loc[users.duplicated(keep=False), :]\n\n\n# only consider a subset of columns when identifying duplicates\nusers.duplicated(subset=['age', 'zip_code']).sum()\n\n\n# drop the duplicate rows (inplace=False by default)\nusers.drop_duplicates(keep='first').shape\n\n\nusers.drop_duplicates(keep='last').shape\n\n\nusers.drop_duplicates(keep=False).shape\n\n\n# ## Appending pandas Series\n\n\n# Load 'sales-jan-2015.csv' into a DataFrame: jan\njan = pd.read_csv('./dataset/sales-jan-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Load 'sales-feb-2015.csv' into a DataFrame: feb\nfeb = pd.read_csv('./dataset/sales-feb-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Load 'sales-mar-2015.csv' into a DataFrame: mar\nmar = pd.read_csv('./dataset/sales-mar-2015.csv',\n parse_dates=True, index_col='Date')\n\n# Extract the 'Units' column from jan: jan_units\njan_units = pd.DataFrame(jan['Units'])\n\n# Extract the 'Units' column from feb: feb_units\nfeb_units = pd.DataFrame(feb['Units'])\n\n# Extract the 'Units' column from mar: mar_units\nmar_units = pd.DataFrame(mar['Units'])\n\n# Append feb_units and then mar_units to jan_units: quarter1\nquarter1 = jan_units.append(feb_units).append(mar_units)\n\n# Print the first slice from quarter1\nprint(quarter1.loc['jan 27, 2015':'feb 2, 2015'])\n\n# Print the second slice from quarter1\nprint(quarter1.loc['feb 26, 2015':'mar 7, 2015'])\n\n# Compute & print total sales in quarter1\nprint(quarter1.sum())\n\n\ndf_quarter = pd.DataFrame(quarter1, columns=['Units'])\n\n\ndf_quarter\n\n\njan_units.reset_index(inplace=True)\nfeb_units.reset_index(inplace=True)\nmar_units.reset_index(inplace=True)\nquarter_columns = pd.concat(\n [jan_units, feb_units, mar_units], 
axis=1, ignore_index=False)\n\n\ndf_quarter_columns = pd.DataFrame(quarter_columns)\n\n\ndf_quarter_columns\n\n\n# ## Reading multiple files to build a DataFrame\n#\n# It is often convenient to build a large DataFrame by parsing many files as DataFrames and concatenating them all at once. You'll do this here with three files, but, in principle, this approach can be used to combine data from dozens or hundreds of files.\n#\n# Here, you'll work with DataFrames compiled from The Guardian's Olympic medal dataset.\n\n\nmedals = []\nmedal_types = ['gold', 'silver', 'bronze']\nfor medal in medal_types:\n\n # Create the file name: file_name\n file_name = \"./dataset/olympic-medals/%s_top5.csv\" % medal\n\n # Create list of column names: columns\n columns = ['Country', medal]\n\n # Read file_name into a DataFrame: df\n medal_df = pd.read_csv(file_name, header=0,\n index_col='Country', names=columns)\n\n # Append medal_df to medals\n medals.append(medal_df)\n\n# Concatenate medals horizontally: medals\nmedals = pd.concat(medals, axis='columns', sort=True)\n\n# Print medals\npd.DataFrame(medals)\n\n\n# ## Concatenating vertically to get MultiIndexed rows\n#\n# When stacking a sequence of DataFrames vertically, it is sometimes desirable to construct a MultiIndex to indicate the DataFrame from which each row originated. This can be done by specifying the keys parameter in the call to pd.concat(), which generates a hierarchical index with the labels from keys as the outermost index label. So you don't have to rename the columns of each DataFrame as you load it. Instead, only the Index column needs to be specified.\n#\n#\n\n\nmedals = []\nfor medal in medal_types:\n\n file_name = \"./dataset/olympic-medals/%s_top5.csv\" % medal\n\n # Read file_name into a DataFrame: medal_df\n medal_df = pd.read_csv(file_name, index_col='Country')\n", "project_metadata": {"full_name": "Cpicon/DataSciencePythonCourse", "description": "This is a course to learn Python focuses in Data Science. ", "topics": [], "git_url": "git://github.com/Cpicon/DataSciencePythonCourse.git", "stars": 4, "watchers": 4, "forks": 3, "created": "2019-01-15T05:51:23Z", "size": 18929, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 25356511}, "last_updated": "2020-12-19T18:01:47Z"}, "intent": " # Append medal_df to medals"}, {"original_comment": "# To create sequences of numbers\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# CSDN\uff1ahttp://blog.csdn.net/kicilove/article/\n#\n# github\uff1ahttps://github.com/zhaohuicici?tab=repositories\n\n#%%\n\nfrom numpy import pi\nimport numpy as np\nimport numpy\n\nworld_alcohol = numpy.genfromtxt(\"world_alcohol.txt\", delimiter=\",\")\nprint(type(world_alcohol))\n\n#%%\n\n# The numpy.array() function can take a list or list of lists as input. 
When we input a list, we get a one-dimensional array as a result:\nvector = numpy.array([5, 10, 15, 20])\n# When we input a list of lists, we get a matrix as a result:\nmatrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])\nprint(vector)\nprint(matrix)\n\n#%%\n\n# We can use the ndarray.shape property to figure out how many elements are in the array\nvector = numpy.array([1, 2, 3, 4])\nprint(vector.shape)\n# For matrices, the shape property contains a tuple with 2 elements.\nmatrix = numpy.array([[5, 10, 15], [20, 25, 30]])\nprint(matrix.shape)\n\n#%%\n\n# Each value in a NumPy array has to have the same data type\n# NumPy will automatically figure out an appropriate data type when reading in data or converting lists to arrays.\n# You can check the data type of a NumPy array using the dtype property.\nnumbers = numpy.array([1, 2, 3, 4])\nnumbers.dtype\n\n#%%\n\n# When NumPy can't convert a value to a numeric data type like float or integer, it uses a special nan value that stands for Not a Number\n# nan is the missing data\n# 1.98600000e+03 is actually 1.986 * 10 ^ 3\nworld_alcohol\n\n#%%\n\nworld_alcohol = numpy.genfromtxt(\n \"world_alcohol.txt\", delimiter=\",\", dtype=\"U75\", skip_header=1)\nprint(world_alcohol)\n\n#%%\n\nuruguay_other_1986 = world_alcohol[1, 4]\nthird_country = world_alcohol[2, 2]\nprint(uruguay_other_1986)\nprint(third_country)\n\n#%%\n\nvector = numpy.array([5, 10, 15, 20])\nprint(vector[0:3])\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[:, 1])\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[:, 0:2])\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[1:3, 0:2])\n\n#%%\n\n# it will compare the second value to each element in the vector\n# If the values are equal, the Python interpreter returns True; otherwise, it returns False\nvector = numpy.array([5, 10, 15, 20])\nvector == 10\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix == 25\n\n#%%\n\n# Compares vector to the value 10, which generates a new Boolean vector [False, True, False, False]. 
It assigns this result to equal_to_ten\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten = (vector == 10)\nprint(equal_to_ten)\nprint(vector[equal_to_ten])\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nsecond_column_25 = (matrix[:, 1] == 25)\nprint(second_column_25)\nprint(matrix[second_column_25, :])\n\n#%%\n\n# We can also perform comparisons with multiple conditions\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_and_five = (vector == 10) & (vector == 5)\nprint(equal_to_ten_and_five)\n\n#%%\n\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_or_five = (vector == 10) | (vector == 5)\nprint(equal_to_ten_or_five)\n\n#%%\n\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_or_five = (vector == 10) | (vector == 5)\nvector[equal_to_ten_or_five] = 50\nprint(vector)\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nsecond_column_25 = matrix[:, 1] == 25\nprint(second_column_25)\nmatrix[second_column_25, 1] = 10\nprint(matrix)\n\n#%%\n\n# We can convert the data type of an array with the ndarray.astype() method.\nvector = numpy.array([\"1\", \"2\", \"3\"])\nprint(vector.dtype)\nprint(vector)\nvector = vector.astype(float)\nprint(vector.dtype)\nprint(vector)\n\n#%%\n\nvector = numpy.array([5, 10, 15, 20])\nvector.sum()\n\n#%%\n\n# The axis dictates which dimension we perform the operation on\n# 1 means that we want to perform the operation on each row, and 0 means on each column\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix.sum(axis=1)\n\n#%%\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix.sum(axis=0)\n\n#%%\n\n# replace nan value with 0\nworld_alcohol = numpy.genfromtxt(\"world_alcohol.txt\", delimiter=\",\")\n# print world_alcohol\nis_value_empty = numpy.isnan(world_alcohol[:, 4])\n# print is_value_empty\nworld_alcohol[is_value_empty, 4] = '0'\nalcohol_consumption = world_alcohol[:, 4]\nalcohol_consumption = alcohol_consumption.astype(float)\ntotal_alcohol = alcohol_consumption.sum()\naverage_alcohol = alcohol_consumption.mean()\nprint(total_alcohol)\nprint(average_alcohol)\n\n#%%\n\n\n\n#%%\n\na = np.arange(15).reshape(3, 5)\na\n\n#%%\n\na.shape\n\n#%%\n\n# the number of axes (dimensions) of the array\na.ndim\n\n#%%\n\na.dtype.name\n\n#%%\n\n# the total number of elements of the array\na.size\n\n#%%\n\nnp.zeros((3, 4))\n\n#%%\n\nnp.ones((2, 3, 4), dtype=np.int32)\n\n#%%", "target_code": "np.arange(10, 30, 5)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# CSDN\uff1ahttp://blog.csdn.net/kicilove/article/\n#\n# github\uff1ahttps://github.com/zhaohuicici?tab=repositories\n\n\nfrom numpy import pi\nimport numpy as np\nimport numpy\n\nworld_alcohol = numpy.genfromtxt(\"world_alcohol.txt\", delimiter=\",\")\nprint(type(world_alcohol))\n\n\n# The numpy.array() function can take a list or list of lists as input. 
When we input a list, we get a one-dimensional array as a result:\nvector = numpy.array([5, 10, 15, 20])\n# When we input a list of lists, we get a matrix as a result:\nmatrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])\nprint(vector)\nprint(matrix)\n\n\n# We can use the ndarray.shape property to figure out how many elements are in the array\nvector = numpy.array([1, 2, 3, 4])\nprint(vector.shape)\n# For matrices, the shape property contains a tuple with 2 elements.\nmatrix = numpy.array([[5, 10, 15], [20, 25, 30]])\nprint(matrix.shape)\n\n\n# Each value in a NumPy array has to have the same data type\n# NumPy will automatically figure out an appropriate data type when reading in data or converting lists to arrays.\n# You can check the data type of a NumPy array using the dtype property.\nnumbers = numpy.array([1, 2, 3, 4])\nnumbers.dtype\n\n\n# When NumPy can't convert a value to a numeric data type like float or integer, it uses a special nan value that stands for Not a Number\n# nan is the missing data\n# 1.98600000e+03 is actually 1.986 * 10 ^ 3\nworld_alcohol\n\n\nworld_alcohol = numpy.genfromtxt(\n \"world_alcohol.txt\", delimiter=\",\", dtype=\"U75\", skip_header=1)\nprint(world_alcohol)\n\n\nuruguay_other_1986 = world_alcohol[1, 4]\nthird_country = world_alcohol[2, 2]\nprint(uruguay_other_1986)\nprint(third_country)\n\n\nvector = numpy.array([5, 10, 15, 20])\nprint(vector[0:3])\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[:, 1])\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[:, 0:2])\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nprint(matrix[1:3, 0:2])\n\n\n# it will compare the second value to each element in the vector\n# If the values are equal, the Python interpreter returns True; otherwise, it returns False\nvector = numpy.array([5, 10, 15, 20])\nvector == 10\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix == 25\n\n\n# Compares vector to the value 10, which generates a new Boolean vector [False, True, False, False]. 
It assigns this result to equal_to_ten\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten = (vector == 10)\nprint(equal_to_ten)\nprint(vector[equal_to_ten])\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nsecond_column_25 = (matrix[:, 1] == 25)\nprint(second_column_25)\nprint(matrix[second_column_25, :])\n\n\n# We can also perform comparisons with multiple conditions\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_and_five = (vector == 10) & (vector == 5)\nprint(equal_to_ten_and_five)\n\n\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_or_five = (vector == 10) | (vector == 5)\nprint(equal_to_ten_or_five)\n\n\nvector = numpy.array([5, 10, 15, 20])\nequal_to_ten_or_five = (vector == 10) | (vector == 5)\nvector[equal_to_ten_or_five] = 50\nprint(vector)\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nsecond_column_25 = matrix[:, 1] == 25\nprint(second_column_25)\nmatrix[second_column_25, 1] = 10\nprint(matrix)\n\n\n# We can convert the data type of an array with the ndarray.astype() method.\nvector = numpy.array([\"1\", \"2\", \"3\"])\nprint(vector.dtype)\nprint(vector)\nvector = vector.astype(float)\nprint(vector.dtype)\nprint(vector)\n\n\nvector = numpy.array([5, 10, 15, 20])\nvector.sum()\n\n\n# The axis dictates which dimension we perform the operation on\n# 1 means that we want to perform the operation on each row, and 0 means on each column\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix.sum(axis=1)\n\n\nmatrix = numpy.array([\n [5, 10, 15],\n [20, 25, 30],\n [35, 40, 45]\n])\nmatrix.sum(axis=0)\n\n\n# replace nan value with 0\nworld_alcohol = numpy.genfromtxt(\"world_alcohol.txt\", delimiter=\",\")\n# print world_alcohol\nis_value_empty = numpy.isnan(world_alcohol[:, 4])\n# print is_value_empty\nworld_alcohol[is_value_empty, 4] = '0'\nalcohol_consumption = world_alcohol[:, 4]\nalcohol_consumption = alcohol_consumption.astype(float)\ntotal_alcohol = alcohol_consumption.sum()\naverage_alcohol = alcohol_consumption.mean()\nprint(total_alcohol)\nprint(average_alcohol)\n\n\n\n\n\na = np.arange(15).reshape(3, 5)\na\n\n\na.shape\n\n\n# the number of axes (dimensions) of the array\na.ndim\n\n\na.dtype.name\n\n\n# the total number of elements of the array\na.size\n\n\nnp.zeros((3, 4))\n\n\nnp.ones((2, 3, 4), dtype=np.int32)\n\n", "project_metadata": {"full_name": "zhaohuicici/Python-lib", "description": "Python --numpy,pandas,matplotlib\u7b49\u5b66\u4e60\u7b14\u8bb0", "topics": [], "git_url": "git://github.com/zhaohuicici/Python-lib.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2017-10-23T05:10:04Z", "size": 1343, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1005990}, "last_updated": "2020-12-07T07:54:23Z"}, "intent": "# create sequences of numbers"}, {"original_comment": "# Reset index to turn the results into a DataFrame\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Marketing Campaign and Business Analytics\n\n#%%\n\nfrom scipy.stats import ttest_ind\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# # Business KPI Understanding & Meanings\n\n#%%\n\n# Import marketing.csv with date columns\nmarketing = pd.read_csv('marketing1.csv',\n parse_dates=['date_served', 'date_subscribed', 'date_canceled'])\n# Data Summary\nprint(marketing.head(5))\nprint(marketing.info())\nprint(marketing.describe())\n\n\n# ## Feature Engineering and Aligning Data\n\n#%%\n\n# Check the data type of 
is_retained\nprint(marketing['is_retained'].dtype)\n\n# Convert is_retained to a boolean\nmarketing['is_retained'] = marketing['is_retained'].astype('bool')\n\n# Check the data type of is_retained, again\nprint(marketing['is_retained'].dtype)\n\n#%%\n\n# Mapping for channels\nchannel_dict = {\"House Ads\": 1, \"Instagram\": 2,\n \"Facebook\": 3, \"Email\": 4, \"Push\": 5}\n\n# Map the channel to a channel code\nmarketing['channel_code'] = marketing['subscribing_channel'].map(channel_dict)\n\n# Add the new column is_correct_lang to check if language preferred and the language of advertisement were safe\nmarketing['is_correct_lang'] = np.where(\n marketing['language_preferred'] == marketing['language_displayed'], \"Yes\", \"No\")\n\n# Add a DoW column (Day of Week Column)\nmarketing['DoW'] = marketing['date_subscribed'].dt.dayofweek\n\nprint(marketing.head(3))\n\n\n# # Exploratory Analysis\n\n#%%\n\n# Count the number of users who saw marketing advertisements each day\n# Group by date_served and count number of unique user_id's\ndaily_users = marketing.groupby(['date_served'])['user_id'].nunique()\n\n# Print head of daily_users\nprint(daily_users.head())\n\n# Plot daily_subscribers\ndaily_users.plot()\n\n# Include a title and y-axis label\nplt.title('Daily users')\nplt.ylabel('Number of users')\nplt.ylabel('Date of Advertisment served')\n\n# Rotate the x-axis labels by 45 degrees\nplt.xticks(rotation=45)\n\n# Display the plot\nplt.show()\n\n\n# # Common Marketing Metrics\n# ## Conversion Rate & Retention Rate\n#\n#\n# 1) Was the campaign Successfull ?
\n#   a) Conversion Rate - How many bought the product?
\n#     Formula = People Who Bought the Product / No. of People We Marketed To
\n#   b) Retention Rate - Can be measured only after 90 Days
\n#\n\n#%%\n\n# Calculate the number of people we marketed to\ntotal = marketing['user_id'].nunique()\n\n# Calculate the number of people who subscribed\nsubscribers = marketing[marketing[\"converted\"] == True]['user_id'].nunique()\n\n# Calculate the conversion rate\nconversion_rate = subscribers/total\nprint(\"Conversion rate :{} {}\".format(round(conversion_rate*100, 2), \"%\"))\n\n\n# Calculate the number of people who remained subscribed\nretained = marketing[marketing['is_retained'] == True]['user_id'].nunique()\n\n# Calculate the retention rate\nretention_rate = retained/subscribers\nprint(\"Retention Rate rate :{} {}\".format(round(retention_rate*100, 2), \"%\"))\n\n\n# ## Customer Segmentation\n#\n# 1) Age
\n# 2) Gender
\n# 3) Location
\n# 4) Past Interactions with Business
\n# 5) Marketing Channels user interacted with
\n#\n# ### Language Conversion Rate\n\n#%%\n\n# Group by language_displayed and count unique users\ntotal = marketing.groupby(['language_displayed'])['user_id'].nunique()\n\n# Group by language_displayed and count unique conversions\nsubscribers = marketing[marketing['converted'] == True] .groupby(\n ['language_displayed'])['user_id'].nunique()\n\n# Calculate the conversion rate for all languages\nlanguage_conversion_rate = subscribers/total\nprint(language_conversion_rate)\n\n# Create a bar chart using language_conversion_rate DataFrame\nlanguage_conversion_rate.plot(kind='bar')\n\n# Add a title and x and y-axis labels\nplt.title('Conversion rate by language\\n', size=16)\nplt.ylabel('Conversion rate (%)', size=14)\nplt.xlabel('Language', size=14)\n\n# Display the plot\nplt.show()\n\n\n# ### Conversion Rate on Differenet Dates\n\n#%%\n\n# Group by date_served and count unique users\ntotal = marketing.groupby(['date_served'])[\n 'user_id'] .nunique()\n\n# Group by date_served and calculate subscribers\nsubscribers = marketing[marketing['converted'] == True] .groupby(\n ['date_served'])['user_id'].nunique()\n\n# Calculate the conversion rate for all languages\ndaily_conversion_rate = subscribers/total\nprint(daily_conversion_rate)", "target_code": "daily_conversion_rate = pd.DataFrame(daily_conversion_rate.reset_index(0))\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Marketing Campaign and Business Analytics\n\n\nfrom scipy.stats import ttest_ind\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# # Business KPI Understanding & Meanings\n\n\n# Import marketing.csv with date columns\nmarketing = pd.read_csv('marketing1.csv',\n parse_dates=['date_served', 'date_subscribed', 'date_canceled'])\n# Data Summary\nprint(marketing.head(5))\nprint(marketing.info())\nprint(marketing.describe())\n\n\n# ## Feature Engineering and Aligning Data\n\n\n# Check the data type of is_retained\nprint(marketing['is_retained'].dtype)\n\n# Convert is_retained to a boolean\nmarketing['is_retained'] = marketing['is_retained'].astype('bool')\n\n# Check the data type of is_retained, again\nprint(marketing['is_retained'].dtype)\n\n\n# Mapping for channels\nchannel_dict = {\"House Ads\": 1, \"Instagram\": 2,\n \"Facebook\": 3, \"Email\": 4, \"Push\": 5}\n\n# Map the channel to a channel code\nmarketing['channel_code'] = marketing['subscribing_channel'].map(channel_dict)\n\n# Add the new column is_correct_lang to check if language preferred and the language of advertisement were safe\nmarketing['is_correct_lang'] = np.where(\n marketing['language_preferred'] == marketing['language_displayed'], \"Yes\", \"No\")\n\n# Add a DoW column (Day of Week Column)\nmarketing['DoW'] = marketing['date_subscribed'].dt.dayofweek\n\nprint(marketing.head(3))\n\n\n# # Exploratory Analysis\n\n\n# Count the number of users who saw marketing advertisements each day\n# Group by date_served and count number of unique user_id's\ndaily_users = marketing.groupby(['date_served'])['user_id'].nunique()\n\n# Print head of daily_users\nprint(daily_users.head())\n\n# Plot daily_subscribers\ndaily_users.plot()\n\n# Include a title and y-axis label\nplt.title('Daily users')\nplt.ylabel('Number of users')\nplt.ylabel('Date of Advertisment served')\n\n# Rotate the x-axis labels by 45 degrees\nplt.xticks(rotation=45)\n\n# Display the plot\nplt.show()\n\n\n# # Common Marketing Metrics\n# ## Conversion Rate & Retention Rate\n#\n#\n# 1) Was the campaign Successfull ?
\n#   a) Conversion Rate - How many bought the product?
\n#     Formula = People Who Bought the Product / No. of People We Marketed To
\n#   b) Retention Rate - Can be measured only after 90 Days
\n#\n\n\n# Calculate the number of people we marketed to\ntotal = marketing['user_id'].nunique()\n\n# Calculate the number of people who subscribed\nsubscribers = marketing[marketing[\"converted\"] == True]['user_id'].nunique()\n\n# Calculate the conversion rate\nconversion_rate = subscribers/total\nprint(\"Conversion rate :{} {}\".format(round(conversion_rate*100, 2), \"%\"))\n\n\n# Calculate the number of people who remained subscribed\nretained = marketing[marketing['is_retained'] == True]['user_id'].nunique()\n\n# Calculate the retention rate\nretention_rate = retained/subscribers\nprint(\"Retention Rate rate :{} {}\".format(round(retention_rate*100, 2), \"%\"))\n\n\n# ## Customer Segmentation\n#\n# 1) Age
\n# 2) Gender
\n# 3) Location
\n# 4) Past Interactions with Business
\n# 5) Marketing Channels user interacted with
\n#\n# ### Language Conversion Rate\n\n\n# Group by language_displayed and count unique users\ntotal = marketing.groupby(['language_displayed'])['user_id'].nunique()\n\n# Group by language_displayed and count unique conversions\nsubscribers = marketing[marketing['converted'] == True] .groupby(\n ['language_displayed'])['user_id'].nunique()\n\n# Calculate the conversion rate for all languages\nlanguage_conversion_rate = subscribers/total\nprint(language_conversion_rate)\n\n# Create a bar chart using language_conversion_rate DataFrame\nlanguage_conversion_rate.plot(kind='bar')\n\n# Add a title and x and y-axis labels\nplt.title('Conversion rate by language\\n', size=16)\nplt.ylabel('Conversion rate (%)', size=14)\nplt.xlabel('Language', size=14)\n\n# Display the plot\nplt.show()\n\n\n# ### Conversion Rate on Differenet Dates\n\n\n# Group by date_served and count unique users\ntotal = marketing.groupby(['date_served'])[\n 'user_id'] .nunique()\n\n# Group by date_served and calculate subscribers\nsubscribers = marketing[marketing['converted'] == True] .groupby(\n ['date_served'])['user_id'].nunique()\n\n# Calculate the conversion rate for all languages\ndaily_conversion_rate = subscribers/total\nprint(daily_conversion_rate)\n", "project_metadata": {"full_name": "KshitizSharmaV/DataScience_In_Investment_Banking", "description": "Data Science In Investment Banking", "topics": [], "git_url": "git://github.com/KshitizSharmaV/DataScience_In_Investment_Banking.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2019-05-31T07:49:12Z", "size": 6682, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6779969}, "last_updated": "2020-09-18T23:07:23Z"}, "intent": "# Reset index to turn the results into a DataFrame"}, {"original_comment": "# Concatenate with the TextCounts variables\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Introduction\n# In this notebook I will explore techniques to use text data as input for a classification. The [Twitter US Airline Sentiment data set](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) looks like a nice data set to work with. So let's take off!\n\n# # Importing modules\n\n#%%\n\nimport warnings\nfrom nltk.tokenize import word_tokenize\nfrom nltk.stem import PorterStemmer\nfrom nltk.corpus import stopwords\nimport gensim\nfrom sklearn.externals import joblib\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline, FeatureUnion\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.base import BaseEstimator, TransformerMixin\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport collections\nfrom pprint import pprint\nimport emoji\nimport os\nimport string\nimport re\nfrom time import time\nimport numpy as np\nimport pandas as pd\npd.set_option('display.max_colwidth', -1)\n\nsns.set(style=\"darkgrid\")\nsns.set(font_scale=1.3)\n\n\nwarnings.filterwarnings('ignore')\n\nnp.random.seed(37)\nKAGGLE_ENV = os.getcwd == '/kaggle/working'\n\n\n# # Loading the data\n# We shuffle the data frame in case the classes would be sorted. This can be done with the **reindex** method applied on the **permutation** of the original indices. 
In this notebook we will only focus on the text variable and the class variable.\n\n#%%\n\ndf = pd.read_csv('../input/Tweets.csv')\ndf = df.reindex(np.random.permutation(df.index))\ndf = df[['text', 'airline_sentiment']]\n\n\n# # Exploratory Data Analysis\n\n# ## Target variable\n# There are three class labels to predict: *negative, neutral or positive*.\n#\n# **CONCLUSION: **The class labels are **imbalanced** as we can see below. This is something that we should keep in mind during the model training phase. We could, for instance, make sure the classes are balanced by up/undersampling.\n\n#%%\n\ntarget_dist = sns.factorplot(x=\"airline_sentiment\", data=df,\n kind=\"count\", size=6, aspect=1.5, palette=\"PuBuGn_d\")\nplt.show()\nif not KAGGLE_ENV:\n target_dist.savefig('../output/target_dist.png')\n\n\n# ## Text variable\n# To analyze the text variable we create a class **TextCounts**. In this class we compute some basic statistics on the text variable. This class can be used later in a Pipeline, as well.\n#\n# * **count_words** : number of words in the tweet\n# * **count_mentions** : referrals to other Twitter accounts, which are preceded by a @\n# * **count_hashtags** : number of tag words, preceded by a #\n# * **count_capital_words** : number of uppercase words, could be used to *\"shout\"* and express (negative) emotions\n# * **count_excl_quest_marks** : number of question or exclamation marks\n# * **count_urls** : number of links in the tweet, preceded by http(s)\n# * **count_emojis** : number of emoji, which might be a good indication of the sentiment\n\n#%%\n\nclass TextCounts(BaseEstimator, TransformerMixin):\n\n def count_regex(self, pattern, tweet):\n return len(re.findall(pattern, tweet))\n\n def fit(self, X, y=None, **fit_params):\n # fit method is used when specific operations need to be done on the train data, but not on the test data\n return self\n\n def transform(self, X, **transform_params):\n count_words = X.apply(lambda x: self.count_regex(r'\\w+', x))\n count_mentions = X.apply(lambda x: self.count_regex(r'@\\w+', x))\n count_hashtags = X.apply(lambda x: self.count_regex(r'#\\w+', x))\n count_capital_words = X.apply(\n lambda x: self.count_regex(r'\\b[A-Z]{2,}\\b', x))\n count_excl_quest_marks = X.apply(\n lambda x: self.count_regex(r'!|\\?', x))\n count_urls = X.apply(lambda x: self.count_regex(\n r'http.?://[^\\s]+[\\s]?', x))\n # We will replace the emoji symbols with a description, which makes using a regex for counting easier\n # Moreover, it will result in having more words in the tweet\n count_emojis = X.apply(lambda x: emoji.demojize(x)).apply(\n lambda x: self.count_regex(r':[a-z_&]+:', x))\n\n df = pd.DataFrame({'count_words': count_words, 'count_mentions': count_mentions, 'count_hashtags': count_hashtags, 'count_capital_words': count_capital_words, 'count_excl_quest_marks': count_excl_quest_marks, 'count_urls': count_urls, 'count_emojis': count_emojis\n })\n\n return df\n\n#%%\n\ntc = TextCounts()\ndf_eda = tc.fit_transform(df.text)\n# Add airline_sentiment to df_eda\ndf_eda['airline_sentiment'] = df.airline_sentiment\n\n\n# It could be interesting to see how the TextStats variables relate to the class variable. 
Therefore we write a function **show_dist** that provides descriptive statistics and a plot per target class.\n\n#%%\n\ndef show_dist(df, col):\n print('Descriptive stats for {}'.format(col))\n print('-'*(len(col)+22))\n print(df.groupby('airline_sentiment')[col].describe())\n bins = np.arange(df[col].min(), df[col].max() + 1)\n g = sns.FacetGrid(df, col='airline_sentiment', size=5,\n hue='airline_sentiment', palette=\"PuBuGn_d\")\n g = g.map(sns.distplot, col, kde=False, norm_hist=True, bins=bins)\n plt.show()\n if not KAGGLE_ENV:\n g.savefig('../output/' + col + '_dist.png')\n\n#%%\n\nshow_dist(df_eda, 'count_words')\n\n#%%\n\nshow_dist(df_eda, 'count_mentions')\n\n#%%\n\nshow_dist(df_eda, 'count_hashtags')\n\n#%%\n\nshow_dist(df_eda, 'count_capital_words')\n\n#%%\n\nshow_dist(df_eda, 'count_excl_quest_marks')\n\n#%%\n\nshow_dist(df_eda, 'count_urls')\n\n#%%\n\nshow_dist(df_eda, 'count_emojis')\n\n\n# **CONCLUSIONS: **\n# * **The number of words** used in the tweets is rater low. Maximum number of words is 36 and there are even tweets with only 2 words. So we'll have to be careful during data cleaning not to remove too many words. On the other hand, the text processing will be faster. Negative tweets contain more words than neutral or positive tweets.\n# * All tweets have at least one **mention**. Probably this is the result of extracting the tweets based on mentions in the Twitter data. There seems to be no difference in number of mentions with regard to the sentiment.\n# * Most of the tweets do not contain **hash tags**. So probably this variable will not be retained during model training. Again, no difference in number of hash tags with regard to the sentiment.\n# * Most of the tweets do not contain **capitalized words** and we do not see a difference in distribution between the sentiments.\n# * The positive tweets seem to be using a bit more **exclamation or question marks**.\n# * Most tweets do not contain a **URL**.\n# * Most tweets do not use **emojis**.\n\n# # Text Cleaning\n# Before we start using the tweets' text we clean it. 
We'll do the this in the class CleanText:\n# - remove the **mentions**, as we want to make the model generalisable to tweets of other airline companies too.\n# - remove the **hash tag sign** (#) but not the actual tag as this may contain information\n# - set all words to **lowercase**\n# - remove all **punctuations**, including the question and exclamation marks\n# - remove the **urls** as they do not contain useful information and we did not notice a distinction in the number of urls used between the sentiment classes\n# - make sure the converted **emojis** are kept as one word.\n# - remove **digits**\n# - remove **stopwords**\n# - apply the **PorterStemmer** to keep the stem of the words\n\n#%%\n\nclass CleanText(BaseEstimator, TransformerMixin):\n\n def remove_mentions(self, input_text):\n return re.sub(r'@\\w+', '', input_text)\n\n def remove_urls(self, input_text):\n return re.sub(r'http.?://[^\\s]+[\\s]?', '', input_text)\n\n def emoji_oneword(self, input_text):\n # By compressing the underscore, the emoji is kept as one word\n return input_text.replace('_', '')\n\n def remove_punctuation(self, input_text):\n # Make translation table\n punct = string.punctuation\n # Every punctuation symbol will be replaced by a space\n trantab = str.maketrans(punct, len(punct)*' ')\n return input_text.translate(trantab)\n\n def remove_digits(self, input_text):\n return re.sub('\\d+', '', input_text)\n\n def to_lower(self, input_text):\n return input_text.lower()\n\n def remove_stopwords(self, input_text):\n stopwords_list = stopwords.words('english')\n # Some words which might indicate a certain sentiment are kept via a whitelist\n whitelist = [\"n't\", \"not\", \"no\"]\n words = input_text.split()\n clean_words = [word for word in words if (\n word not in stopwords_list or word in whitelist) and len(word) > 1]\n return \" \".join(clean_words)\n\n def stemming(self, input_text):\n porter = PorterStemmer()\n words = input_text.split()\n stemmed_words = [porter.stem(word) for word in words]\n return \" \".join(stemmed_words)\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n def transform(self, X, **transform_params):\n clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.emoji_oneword).apply(\n self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords).apply(self.stemming)\n return clean_X\n\n\n# To show how the cleaned text variable will look like, here's a sample.\n\n#%%\n\nct = CleanText()\nsr_clean = ct.fit_transform(df.text)\nsr_clean.sample(5)\n\n\n# **NOTE: **One side-effect of text cleaning is that some rows do not have any words left in their text. For the CountVectorizer and TfIdfVectorizer this does not really pose a problem. However, for the Word2Vec algorithm this causes an error. There are different strategies that you could apply to deal with these missing values.\n#\n# * Remove the complete row, but in a production environment this is not really desirable.\n# * Impute the missing value with some placeholder text like *[no_text]*\n# * Word2Vec: use the average of all vectors\n#\n# Here we will impute with a placeholder text.\n\n#%%\n\nempty_clean = sr_clean == ''\nprint('{} records have no words left after text cleaning'.format(\n sr_clean[empty_clean].count()))\nsr_clean.loc[empty_clean] = '[no_text]'\n\n\n# Now that we have the cleaned text of the tweets, we can have a look at what are the most frequent words. 
Below we'll show the top 20 words.\n#\n# **CONCLUSION: **Not surprisingly the most frequent word is *flight*.\n\n#%%\n\ncv = CountVectorizer()\nbow = cv.fit_transform(sr_clean)\nword_freq = dict(zip(cv.get_feature_names(),\n np.asarray(bow.sum(axis=0)).ravel()))\nword_counter = collections.Counter(word_freq)\nword_counter_df = pd.DataFrame(\n word_counter.most_common(20), columns=['word', 'freq'])\n\nfig, ax = plt.subplots(figsize=(12, 10))\nbar_freq_word = sns.barplot(\n x=\"word\", y=\"freq\", data=word_counter_df, palette=\"PuBuGn_d\", ax=ax)\nplt.show()\nif not KAGGLE_ENV:\n bar_freq_word.get_figure().savefig('../output/bar_freq_word.png')\n\n\n# # Creating test data\n# To evaluate the trained models we'll need a **test set**. Evaluating on the train data would not be correct because the models are trained to minimize their cost function.\n#\n# First we combine the TextCounts variables with the CleanText variable.\n#\n# **NOTE: **Initially, I made the mistake to do execute TextCounts and CleanText in the GridSearchCV below. This took too long as it applies these functions each run of the GridSearch. It suffices to run them only once.\n\n#%%\n\ndf_model = df_eda\ndf_model['clean_text'] = sr_clean\ndf_model.columns.tolist()\n\n\n# So df_model now contains several variables. However, our vectorizers (see below) will only need the *clean_text* variable. The TextCounts variables can be added as such. To specifically select columns, I wrote the class **ColumnExtractor** below. This can be used in the Pipeline afterwards.\n\n#%%\n\nclass ColumnExtractor(TransformerMixin, BaseEstimator):\n def __init__(self, cols):\n self.cols = cols\n\n def transform(self, X, **transform_params):\n return X[self.cols]\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(df_model.drop(\n 'airline_sentiment', axis=1), df_model.airline_sentiment, test_size=0.1, random_state=37)\n\n\n# ## Hyperparameter tuning and cross-validation\n# As we will see below, the vectorizers and classifiers all have configurable parameters. In order to chose the best parameters, we need to evaluate on a separate validation set that was not used during the training. However, using only one validation set may not produce reliable validation results. Due to chance you might have a good model performance on the validation set. If you would split the data otherwise, you might end up with other results. To get a more accurate estimation, we perform **cross-validation**.\n#\n# With cross-validation the data is split into a train and validation set multiple times. The evaluation metric is then averaged over the different folds. Luckily, GridSearchCV applies cross-validation out-of-the-box.\n#\n# To find the best parameters for both a vectorizer and classifier, we create a **Pipeline**. All this is put into a function for ease of use.\n#\n# ### Evaluation metrics\n# By default GridSearchCV uses the default scorer to compute the *best_score_*. For both the MultiNomialNb and LogisticRegression this default scoring metric is the accuracy.\n#\n# In our function *grid_vect* we additionally generate the *classification_report* on the test data. This provides some interesting metrics **per target class**, which might be more appropriate here. 
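# As a quick toy illustration (made-up labels, not this dataset) of what such a per-class report looks like:

#%%

# Toy example of a per-class report; the labels are invented for illustration only.
from sklearn.metrics import classification_report

y_true_demo = ['negative', 'negative', 'neutral', 'positive', 'positive', 'negative']
y_pred_demo = ['negative', 'neutral', 'neutral', 'positive', 'negative', 'negative']
print(classification_report(y_true_demo, y_pred_demo))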
These metrics are the **precision, recal and F1 score.**\n#\n# * **Precision: ** Of all rows we predicted to be a certain class, how many did we correctly predict?\n# * **Recall: ** Of all rows of a certain class, how many did we correctly predict?\n# * **F1 score: ** Harmonic mean of Precision and Recall.\n#\n# Precision and Recall can be calculated with the elements of the [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)\n\n#%%\n\n# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html\ndef grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None, is_w2v=False):\n\n textcountscols = ['count_capital_words', 'count_emojis', 'count_excl_quest_marks',\n 'count_hashtags', 'count_mentions', 'count_urls', 'count_words']\n\n if is_w2v:\n w2vcols = []\n for i in range(SIZE):\n w2vcols.append(i)\n features = FeatureUnion([('textcounts', ColumnExtractor(\n cols=textcountscols)), ('w2v', ColumnExtractor(cols=w2vcols))], n_jobs=-1)\n else:\n features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols)), ('pipe', Pipeline(\n [('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))], n_jobs=-1)\n\n pipeline = Pipeline([\n ('features', features), ('clf', clf)\n ])\n\n # Join the parameters dictionaries together\n parameters = dict()\n if parameters_text:\n parameters.update(parameters_text)\n parameters.update(parameters_clf)\n\n # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics\n grid_search = GridSearchCV(\n pipeline, parameters, n_jobs=-1, verbose=1, cv=5)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n\n t0 = time()\n grid_search.fit(X_train, y_train)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best CV score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))\n\n print(\"Test score with best_estimator_: %0.3f\" %\n grid_search.best_estimator_.score(X_test, y_test))\n print(\"\\n\")\n print(\"Classification Report Test Data\")\n print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))\n\n return grid_search\n\n\n# ### Parameter grids for GridSearchCV\n\n#%%\n\n# Parameter grid settings for the vectorizers (Count and TFIDF)\nparameters_vect = {\n 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),\n 'features__pipe__vect__ngram_range': ((1, 1), (1, 2)),\n 'features__pipe__vect__min_df': (1, 2)\n}\n\n\n# Parameter grid settings for MultinomialNB\nparameters_mnb = {\n 'clf__alpha': (0.25, 0.5, 0.75)\n}\n\n\n# Parameter grid settings for LogisticRegression\nparameters_logreg = {\n 'clf__C': (0.25, 0.5, 1.0),\n 'clf__penalty': ('l1', 'l2')\n}\n\n\n# ## Classifiers\n# Here we will compare the performance of a MultinomailNB and LogisticRegression.\n\n#%%\n\nmnb = MultinomialNB()\nlogreg = LogisticRegression()\n\n\n# ## CountVectorizer\n# To use words in a classifier, we need to convert the words to numbers. This can be done with a CountVectorizer. Sklearn's **CountVectorizer** takes all words in all tweets, assigns an ID and counts the frequency of the word per tweet. This *bag of words* can then be used as input for a classifier. 
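# A minimal illustration on three made-up sentences (not from the dataset): each row is one document, each column one vocabulary word, and each cell a word count.

#%%

# Toy bag-of-words to show what CountVectorizer produces.
from sklearn.feature_extraction.text import CountVectorizer

demo_vect = CountVectorizer()
demo_bow = demo_vect.fit_transform(['flight delayed again',
                                    'great flight great crew',
                                    'lost my luggage'])
print(demo_vect.get_feature_names())
print(demo_bow.toarray())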
It is what is called a **sparse** data set, meaning that each record will have many zeroes for the words not occurring in the tweet.\n\n#%%\n\ncountvect = CountVectorizer()\n\n#%%\n\n# MultinomialNB\nbest_mnb_countvect = grid_vect(\n mnb, parameters_mnb, X_train, X_test, parameters_text=parameters_vect, vect=countvect)\nif not KAGGLE_ENV:\n joblib.dump(best_mnb_countvect, '../output/best_mnb_countvect.pkl')\n\n#%%\n\n# LogisticRegression\nbest_logreg_countvect = grid_vect(\n logreg, parameters_logreg, X_train, X_test, parameters_text=parameters_vect, vect=countvect)\nif not KAGGLE_ENV:\n joblib.dump(best_logreg_countvect, '../output/best_logreg_countvect.pkl')\n\n\n# ## TF-IDF\n# One issue with CountVectorizer is that there might be words that occur frequently in observations of the target classes. These words do not have discriminatory information and can be removed. [TF-IDF (term frequency - inverse document frequency)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) can be used to downweight these frequent words.\n\n#%%\n\ntfidfvect = TfidfVectorizer()\n\n#%%\n\n# MultinomialNB\nbest_mnb_tfidf = grid_vect(mnb, parameters_mnb, X_train,\n X_test, parameters_text=parameters_vect, vect=tfidfvect)\nif not KAGGLE_ENV:\n joblib.dump(best_mnb_tfidf, '../output/best_mnb_tfidf.pkl')\n\n#%%\n\n# LogisticRegression\nbest_logreg_tfidf = grid_vect(logreg, parameters_logreg, X_train,\n X_test, parameters_text=parameters_vect, vect=tfidfvect)\nif not KAGGLE_ENV:\n joblib.dump(best_logreg_tfidf, '../output/best_logreg_tfidf.pkl')\n\n\n# ## Word2Vec\n# Another way of converting the words in the tweets to numerical values can be achieved with Word2Vec. Word2Vec maps each word in a multi-dimensional space. It does this by taking into account the context in which a word appears in the tweets. As a result, words that are semantically similar are also close to each other in the multi-dimensional space.\n#\n# The Word2Vec algorithm is implemented in the [gensim](https://radimrehurek.com/gensim/models/word2vec.html) package.\n#\n# The Word2Vec algorithm uses lists of words as input. For that purpose we use the **word_tokenize** method of the the nltk package.\n\n#%%\n\nSIZE = 25\n\nX_train['clean_text_wordlist'] = X_train.clean_text.apply(\n lambda x: word_tokenize(x))\nX_test['clean_text_wordlist'] = X_test.clean_text.apply(\n lambda x: word_tokenize(x))\n\nmodel = gensim.models.Word2Vec(\n X_train.clean_text_wordlist, min_count=1, size=SIZE, window=3, workers=4)\n\n#%%\n\nmodel.most_similar('plane', topn=3)\n\n\n# The Word2Vec model provides a vocabulary of the words in the corpus together with their vector values. The number of vector values is equal to the chosen **size**. These are the dimensions on which each word is mapped in the multi-dimensional space.\n#\n# Words with an occurrence less than **min_count** are not kept in the vocabulary.\n# **NOTE: **A side effect of the **min_count** parameter is that some tweets could have no vector values. This is would be the case when the word(s) in the tweet occur in less than *min_count* tweets. Due to the small corpus of tweets, there is a risk of this happening in our case. Therefore we set the min_count value equal to 1.\n#\n# The tweets can have a different number of vectors, depending on the number of words it contains. To use this output for modeling we will aggregate the vectors per tweet to have the same number (i.e. *size*) of input variables per tweet. Therefore we will take the average of all vectors per tweet. 
We do this with the function **compute_avg_w2v_vector**. In this function we also check whether the words in the tweet occur in the vocabulary of the word2vec model. If not, a list filled with 0.0 is returned. Else the average of the word vectors.\n\n#%%\n\ndef compute_avg_w2v_vector(w2v_dict, tweet):\n list_of_word_vectors = [w2v_dict[w]\n for w in tweet if w in w2v_dict.vocab.keys()]\n\n if len(list_of_word_vectors) == 0:\n result = [0.0]*SIZE\n else:\n result = np.sum(list_of_word_vectors, axis=0) / \\\n len(list_of_word_vectors)\n\n return result\n\n#%%\n\nX_train_w2v = X_train['clean_text_wordlist'].apply(\n lambda x: compute_avg_w2v_vector(model.wv, x))\nX_test_w2v = X_test['clean_text_wordlist'].apply(\n lambda x: compute_avg_w2v_vector(model.wv, x))\n\n\n# This gives us a Series with a vector of dimension equal to SIZE. Now we will split this vector and create a DataFrame with each vector value in a separate column. That way we can concatenate the word2vec variables to the other TextCounts variables. We need to reuse the index of X_train and X_test respectively. Otherwise this will give issues (duplicates) in the concatenation later on.\n\n#%%\n\nX_train_w2v = pd.DataFrame(X_train_w2v.values.tolist(), index=X_train.index)\nX_test_w2v = pd.DataFrame(X_test_w2v.values.tolist(), index=X_test.index)", "target_code": "X_train_w2v = pd.concat([X_train_w2v, X_train.drop(\n ['clean_text', 'clean_text_wordlist'], axis=1)], axis=1)\nX_test_w2v = pd.concat([X_test_w2v, X_test.drop(\n ['clean_text', 'clean_text_wordlist'], axis=1)], axis=1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Introduction\n# In this notebook I will explore techniques to use text data as input for a classification. The [Twitter US Airline Sentiment data set](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) looks like a nice data set to work with. So let's take off!\n\n# # Importing modules\n\n\nimport warnings\nfrom nltk.tokenize import word_tokenize\nfrom nltk.stem import PorterStemmer\nfrom nltk.corpus import stopwords\nimport gensim\nfrom sklearn.externals import joblib\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline, FeatureUnion\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.base import BaseEstimator, TransformerMixin\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport collections\nfrom pprint import pprint\nimport emoji\nimport os\nimport string\nimport re\nfrom time import time\nimport numpy as np\nimport pandas as pd\npd.set_option('display.max_colwidth', -1)\n\nsns.set(style=\"darkgrid\")\nsns.set(font_scale=1.3)\n\n\nwarnings.filterwarnings('ignore')\n\nnp.random.seed(37)\nKAGGLE_ENV = os.getcwd == '/kaggle/working'\n\n\n# # Loading the data\n# We shuffle the data frame in case the classes would be sorted. This can be done with the **reindex** method applied on the **permutation** of the original indices. 
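# (Side note, shown for reference only: pandas can also shuffle rows directly with sample(frac=1, random_state=...); this notebook sticks to reindex + permutation. The frame below is a toy example, not the real data.)

import pandas as pd

demo_shuffle = pd.DataFrame({'text': ['a', 'b', 'c'],
                             'airline_sentiment': ['neutral', 'positive', 'negative']})
print(demo_shuffle.sample(frac=1, random_state=37))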
In this notebook we will only focus on the text variable and the class variable.\n\n\ndf = pd.read_csv('../input/Tweets.csv')\ndf = df.reindex(np.random.permutation(df.index))\ndf = df[['text', 'airline_sentiment']]\n\n\n# # Exploratory Data Analysis\n\n# ## Target variable\n# There are three class labels to predict: *negative, neutral or positive*.\n#\n# **CONCLUSION: **The class labels are **imbalanced** as we can see below. This is something that we should keep in mind during the model training phase. We could, for instance, make sure the classes are balanced by up/undersampling.\n\n\ntarget_dist = sns.factorplot(x=\"airline_sentiment\", data=df,\n kind=\"count\", size=6, aspect=1.5, palette=\"PuBuGn_d\")\nplt.show()\nif not KAGGLE_ENV:\n target_dist.savefig('../output/target_dist.png')\n\n\n# ## Text variable\n# To analyze the text variable we create a class **TextCounts**. In this class we compute some basic statistics on the text variable. This class can be used later in a Pipeline, as well.\n#\n# * **count_words** : number of words in the tweet\n# * **count_mentions** : referrals to other Twitter accounts, which are preceded by a @\n# * **count_hashtags** : number of tag words, preceded by a #\n# * **count_capital_words** : number of uppercase words, could be used to *\"shout\"* and express (negative) emotions\n# * **count_excl_quest_marks** : number of question or exclamation marks\n# * **count_urls** : number of links in the tweet, preceded by http(s)\n# * **count_emojis** : number of emoji, which might be a good indication of the sentiment\n\n\nclass TextCounts(BaseEstimator, TransformerMixin):\n\n def count_regex(self, pattern, tweet):\n return len(re.findall(pattern, tweet))\n\n def fit(self, X, y=None, **fit_params):\n # fit method is used when specific operations need to be done on the train data, but not on the test data\n return self\n\n def transform(self, X, **transform_params):\n count_words = X.apply(lambda x: self.count_regex(r'\\w+', x))\n count_mentions = X.apply(lambda x: self.count_regex(r'@\\w+', x))\n count_hashtags = X.apply(lambda x: self.count_regex(r'#\\w+', x))\n count_capital_words = X.apply(\n lambda x: self.count_regex(r'\\b[A-Z]{2,}\\b', x))\n count_excl_quest_marks = X.apply(\n lambda x: self.count_regex(r'!|\\?', x))\n count_urls = X.apply(lambda x: self.count_regex(\n r'http.?://[^\\s]+[\\s]?', x))\n # We will replace the emoji symbols with a description, which makes using a regex for counting easier\n # Moreover, it will result in having more words in the tweet\n count_emojis = X.apply(lambda x: emoji.demojize(x)).apply(\n lambda x: self.count_regex(r':[a-z_&]+:', x))\n\n df = pd.DataFrame({'count_words': count_words, 'count_mentions': count_mentions, 'count_hashtags': count_hashtags, 'count_capital_words': count_capital_words, 'count_excl_quest_marks': count_excl_quest_marks, 'count_urls': count_urls, 'count_emojis': count_emojis\n })\n\n return df\n\n\ntc = TextCounts()\ndf_eda = tc.fit_transform(df.text)\n# Add airline_sentiment to df_eda\ndf_eda['airline_sentiment'] = df.airline_sentiment\n\n\n# It could be interesting to see how the TextStats variables relate to the class variable. 
Therefore we write a function **show_dist** that provides descriptive statistics and a plot per target class.\n\n\ndef show_dist(df, col):\n print('Descriptive stats for {}'.format(col))\n print('-'*(len(col)+22))\n print(df.groupby('airline_sentiment')[col].describe())\n bins = np.arange(df[col].min(), df[col].max() + 1)\n g = sns.FacetGrid(df, col='airline_sentiment', size=5,\n hue='airline_sentiment', palette=\"PuBuGn_d\")\n g = g.map(sns.distplot, col, kde=False, norm_hist=True, bins=bins)\n plt.show()\n if not KAGGLE_ENV:\n g.savefig('../output/' + col + '_dist.png')\n\n\nshow_dist(df_eda, 'count_words')\n\n\nshow_dist(df_eda, 'count_mentions')\n\n\nshow_dist(df_eda, 'count_hashtags')\n\n\nshow_dist(df_eda, 'count_capital_words')\n\n\nshow_dist(df_eda, 'count_excl_quest_marks')\n\n\nshow_dist(df_eda, 'count_urls')\n\n\nshow_dist(df_eda, 'count_emojis')\n\n\n# **CONCLUSIONS: **\n# * **The number of words** used in the tweets is rater low. Maximum number of words is 36 and there are even tweets with only 2 words. So we'll have to be careful during data cleaning not to remove too many words. On the other hand, the text processing will be faster. Negative tweets contain more words than neutral or positive tweets.\n# * All tweets have at least one **mention**. Probably this is the result of extracting the tweets based on mentions in the Twitter data. There seems to be no difference in number of mentions with regard to the sentiment.\n# * Most of the tweets do not contain **hash tags**. So probably this variable will not be retained during model training. Again, no difference in number of hash tags with regard to the sentiment.\n# * Most of the tweets do not contain **capitalized words** and we do not see a difference in distribution between the sentiments.\n# * The positive tweets seem to be using a bit more **exclamation or question marks**.\n# * Most tweets do not contain a **URL**.\n# * Most tweets do not use **emojis**.\n\n# # Text Cleaning\n# Before we start using the tweets' text we clean it. 
We'll do the this in the class CleanText:\n# - remove the **mentions**, as we want to make the model generalisable to tweets of other airline companies too.\n# - remove the **hash tag sign** (#) but not the actual tag as this may contain information\n# - set all words to **lowercase**\n# - remove all **punctuations**, including the question and exclamation marks\n# - remove the **urls** as they do not contain useful information and we did not notice a distinction in the number of urls used between the sentiment classes\n# - make sure the converted **emojis** are kept as one word.\n# - remove **digits**\n# - remove **stopwords**\n# - apply the **PorterStemmer** to keep the stem of the words\n\n\nclass CleanText(BaseEstimator, TransformerMixin):\n\n def remove_mentions(self, input_text):\n return re.sub(r'@\\w+', '', input_text)\n\n def remove_urls(self, input_text):\n return re.sub(r'http.?://[^\\s]+[\\s]?', '', input_text)\n\n def emoji_oneword(self, input_text):\n # By compressing the underscore, the emoji is kept as one word\n return input_text.replace('_', '')\n\n def remove_punctuation(self, input_text):\n # Make translation table\n punct = string.punctuation\n # Every punctuation symbol will be replaced by a space\n trantab = str.maketrans(punct, len(punct)*' ')\n return input_text.translate(trantab)\n\n def remove_digits(self, input_text):\n return re.sub('\\d+', '', input_text)\n\n def to_lower(self, input_text):\n return input_text.lower()\n\n def remove_stopwords(self, input_text):\n stopwords_list = stopwords.words('english')\n # Some words which might indicate a certain sentiment are kept via a whitelist\n whitelist = [\"n't\", \"not\", \"no\"]\n words = input_text.split()\n clean_words = [word for word in words if (\n word not in stopwords_list or word in whitelist) and len(word) > 1]\n return \" \".join(clean_words)\n\n def stemming(self, input_text):\n porter = PorterStemmer()\n words = input_text.split()\n stemmed_words = [porter.stem(word) for word in words]\n return \" \".join(stemmed_words)\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n def transform(self, X, **transform_params):\n clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.emoji_oneword).apply(\n self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords).apply(self.stemming)\n return clean_X\n\n\n# To show how the cleaned text variable will look like, here's a sample.\n\n\nct = CleanText()\nsr_clean = ct.fit_transform(df.text)\nsr_clean.sample(5)\n\n\n# **NOTE: **One side-effect of text cleaning is that some rows do not have any words left in their text. For the CountVectorizer and TfIdfVectorizer this does not really pose a problem. However, for the Word2Vec algorithm this causes an error. There are different strategies that you could apply to deal with these missing values.\n#\n# * Remove the complete row, but in a production environment this is not really desirable.\n# * Impute the missing value with some placeholder text like *[no_text]*\n# * Word2Vec: use the average of all vectors\n#\n# Here we will impute with a placeholder text.\n\n\nempty_clean = sr_clean == ''\nprint('{} records have no words left after text cleaning'.format(\n sr_clean[empty_clean].count()))\nsr_clean.loc[empty_clean] = '[no_text]'\n\n\n# Now that we have the cleaned text of the tweets, we can have a look at what are the most frequent words. 
Below we'll show the top 20 words.\n#\n# **CONCLUSION: **Not surprisingly the most frequent word is *flight*.\n\n\ncv = CountVectorizer()\nbow = cv.fit_transform(sr_clean)\nword_freq = dict(zip(cv.get_feature_names(),\n np.asarray(bow.sum(axis=0)).ravel()))\nword_counter = collections.Counter(word_freq)\nword_counter_df = pd.DataFrame(\n word_counter.most_common(20), columns=['word', 'freq'])\n\nfig, ax = plt.subplots(figsize=(12, 10))\nbar_freq_word = sns.barplot(\n x=\"word\", y=\"freq\", data=word_counter_df, palette=\"PuBuGn_d\", ax=ax)\nplt.show()\nif not KAGGLE_ENV:\n bar_freq_word.get_figure().savefig('../output/bar_freq_word.png')\n\n\n# # Creating test data\n# To evaluate the trained models we'll need a **test set**. Evaluating on the train data would not be correct because the models are trained to minimize their cost function.\n#\n# First we combine the TextCounts variables with the CleanText variable.\n#\n# **NOTE: **Initially, I made the mistake to do execute TextCounts and CleanText in the GridSearchCV below. This took too long as it applies these functions each run of the GridSearch. It suffices to run them only once.\n\n\ndf_model = df_eda\ndf_model['clean_text'] = sr_clean\ndf_model.columns.tolist()\n\n\n# So df_model now contains several variables. However, our vectorizers (see below) will only need the *clean_text* variable. The TextCounts variables can be added as such. To specifically select columns, I wrote the class **ColumnExtractor** below. This can be used in the Pipeline afterwards.\n\n\nclass ColumnExtractor(TransformerMixin, BaseEstimator):\n def __init__(self, cols):\n self.cols = cols\n\n def transform(self, X, **transform_params):\n return X[self.cols]\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n\nX_train, X_test, y_train, y_test = train_test_split(df_model.drop(\n 'airline_sentiment', axis=1), df_model.airline_sentiment, test_size=0.1, random_state=37)\n\n\n# ## Hyperparameter tuning and cross-validation\n# As we will see below, the vectorizers and classifiers all have configurable parameters. In order to chose the best parameters, we need to evaluate on a separate validation set that was not used during the training. However, using only one validation set may not produce reliable validation results. Due to chance you might have a good model performance on the validation set. If you would split the data otherwise, you might end up with other results. To get a more accurate estimation, we perform **cross-validation**.\n#\n# With cross-validation the data is split into a train and validation set multiple times. The evaluation metric is then averaged over the different folds. Luckily, GridSearchCV applies cross-validation out-of-the-box.\n#\n# To find the best parameters for both a vectorizer and classifier, we create a **Pipeline**. All this is put into a function for ease of use.\n#\n# ### Evaluation metrics\n# By default GridSearchCV uses the default scorer to compute the *best_score_*. For both the MultiNomialNb and LogisticRegression this default scoring metric is the accuracy.\n#\n# In our function *grid_vect* we additionally generate the *classification_report* on the test data. This provides some interesting metrics **per target class**, which might be more appropriate here. 
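# As a toy illustration (made-up labels, not this dataset), these per-class metrics can also be read off a confusion matrix, where rows are the true classes and columns the predicted classes:

from sklearn.metrics import confusion_matrix

y_true_demo = ['negative', 'negative', 'neutral', 'positive', 'positive', 'negative']
y_pred_demo = ['negative', 'neutral', 'neutral', 'positive', 'negative', 'negative']
print(confusion_matrix(y_true_demo, y_pred_demo,
                       labels=['negative', 'neutral', 'positive']))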
These metrics are the **precision, recal and F1 score.**\n#\n# * **Precision: ** Of all rows we predicted to be a certain class, how many did we correctly predict?\n# * **Recall: ** Of all rows of a certain class, how many did we correctly predict?\n# * **F1 score: ** Harmonic mean of Precision and Recall.\n#\n# Precision and Recall can be calculated with the elements of the [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)\n\n\n# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html\ndef grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None, is_w2v=False):\n\n textcountscols = ['count_capital_words', 'count_emojis', 'count_excl_quest_marks',\n 'count_hashtags', 'count_mentions', 'count_urls', 'count_words']\n\n if is_w2v:\n w2vcols = []\n for i in range(SIZE):\n w2vcols.append(i)\n features = FeatureUnion([('textcounts', ColumnExtractor(\n cols=textcountscols)), ('w2v', ColumnExtractor(cols=w2vcols))], n_jobs=-1)\n else:\n features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols)), ('pipe', Pipeline(\n [('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))], n_jobs=-1)\n\n pipeline = Pipeline([\n ('features', features), ('clf', clf)\n ])\n\n # Join the parameters dictionaries together\n parameters = dict()\n if parameters_text:\n parameters.update(parameters_text)\n parameters.update(parameters_clf)\n\n # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics\n grid_search = GridSearchCV(\n pipeline, parameters, n_jobs=-1, verbose=1, cv=5)\n\n print(\"Performing grid search...\")\n print(\"pipeline:\", [name for name, _ in pipeline.steps])\n print(\"parameters:\")\n pprint(parameters)\n\n t0 = time()\n grid_search.fit(X_train, y_train)\n print(\"done in %0.3fs\" % (time() - t0))\n print()\n\n print(\"Best CV score: %0.3f\" % grid_search.best_score_)\n print(\"Best parameters set:\")\n best_parameters = grid_search.best_estimator_.get_params()\n for param_name in sorted(parameters.keys()):\n print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))\n\n print(\"Test score with best_estimator_: %0.3f\" %\n grid_search.best_estimator_.score(X_test, y_test))\n print(\"\\n\")\n print(\"Classification Report Test Data\")\n print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))\n\n return grid_search\n\n\n# ### Parameter grids for GridSearchCV\n\n\n# Parameter grid settings for the vectorizers (Count and TFIDF)\nparameters_vect = {\n 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),\n 'features__pipe__vect__ngram_range': ((1, 1), (1, 2)),\n 'features__pipe__vect__min_df': (1, 2)\n}\n\n\n# Parameter grid settings for MultinomialNB\nparameters_mnb = {\n 'clf__alpha': (0.25, 0.5, 0.75)\n}\n\n\n# Parameter grid settings for LogisticRegression\nparameters_logreg = {\n 'clf__C': (0.25, 0.5, 1.0),\n 'clf__penalty': ('l1', 'l2')\n}\n\n\n# ## Classifiers\n# Here we will compare the performance of a MultinomailNB and LogisticRegression.\n\n\nmnb = MultinomialNB()\nlogreg = LogisticRegression()\n\n\n# ## CountVectorizer\n# To use words in a classifier, we need to convert the words to numbers. This can be done with a CountVectorizer. Sklearn's **CountVectorizer** takes all words in all tweets, assigns an ID and counts the frequency of the word per tweet. This *bag of words* can then be used as input for a classifier. 
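# Such a document-term matrix is mostly zeroes. As a quick check (this assumes the bow matrix built in the word-frequency cell above is still in memory), its non-zero share can be computed directly:

n_cells = bow.shape[0] * bow.shape[1]
print('Share of non-zero cells: {:.4f}'.format(bow.nnz / n_cells))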
It is what is called a **sparse** data set, meaning that each record will have many zeroes for the words not occurring in the tweet.\n\n\ncountvect = CountVectorizer()\n\n\n# MultinomialNB\nbest_mnb_countvect = grid_vect(\n mnb, parameters_mnb, X_train, X_test, parameters_text=parameters_vect, vect=countvect)\nif not KAGGLE_ENV:\n joblib.dump(best_mnb_countvect, '../output/best_mnb_countvect.pkl')\n\n\n# LogisticRegression\nbest_logreg_countvect = grid_vect(\n logreg, parameters_logreg, X_train, X_test, parameters_text=parameters_vect, vect=countvect)\nif not KAGGLE_ENV:\n joblib.dump(best_logreg_countvect, '../output/best_logreg_countvect.pkl')\n\n\n# ## TF-IDF\n# One issue with CountVectorizer is that there might be words that occur frequently in observations of the target classes. These words do not have discriminatory information and can be removed. [TF-IDF (term frequency - inverse document frequency)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) can be used to downweight these frequent words.\n\n\ntfidfvect = TfidfVectorizer()\n\n\n# MultinomialNB\nbest_mnb_tfidf = grid_vect(mnb, parameters_mnb, X_train,\n X_test, parameters_text=parameters_vect, vect=tfidfvect)\nif not KAGGLE_ENV:\n joblib.dump(best_mnb_tfidf, '../output/best_mnb_tfidf.pkl')\n\n\n# LogisticRegression\nbest_logreg_tfidf = grid_vect(logreg, parameters_logreg, X_train,\n X_test, parameters_text=parameters_vect, vect=tfidfvect)\nif not KAGGLE_ENV:\n joblib.dump(best_logreg_tfidf, '../output/best_logreg_tfidf.pkl')\n\n\n# ## Word2Vec\n# Another way of converting the words in the tweets to numerical values can be achieved with Word2Vec. Word2Vec maps each word in a multi-dimensional space. It does this by taking into account the context in which a word appears in the tweets. As a result, words that are semantically similar are also close to each other in the multi-dimensional space.\n#\n# The Word2Vec algorithm is implemented in the [gensim](https://radimrehurek.com/gensim/models/word2vec.html) package.\n#\n# The Word2Vec algorithm uses lists of words as input. For that purpose we use the **word_tokenize** method of the the nltk package.\n\n\nSIZE = 25\n\nX_train['clean_text_wordlist'] = X_train.clean_text.apply(\n lambda x: word_tokenize(x))\nX_test['clean_text_wordlist'] = X_test.clean_text.apply(\n lambda x: word_tokenize(x))\n\nmodel = gensim.models.Word2Vec(\n X_train.clean_text_wordlist, min_count=1, size=SIZE, window=3, workers=4)\n\n\nmodel.most_similar('plane', topn=3)\n\n\n# The Word2Vec model provides a vocabulary of the words in the corpus together with their vector values. The number of vector values is equal to the chosen **size**. These are the dimensions on which each word is mapped in the multi-dimensional space.\n#\n# Words with an occurrence less than **min_count** are not kept in the vocabulary.\n# **NOTE: **A side effect of the **min_count** parameter is that some tweets could have no vector values. This is would be the case when the word(s) in the tweet occur in less than *min_count* tweets. Due to the small corpus of tweets, there is a risk of this happening in our case. Therefore we set the min_count value equal to 1.\n#\n# The tweets can have a different number of vectors, depending on the number of words it contains. To use this output for modeling we will aggregate the vectors per tweet to have the same number (i.e. *size*) of input variables per tweet. Therefore we will take the average of all vectors per tweet. We do this with the function **compute_avg_w2v_vector**. 
In this function we also check whether the words in the tweet occur in the vocabulary of the word2vec model. If not, a list filled with 0.0 is returned. Else the average of the word vectors.\n\n\ndef compute_avg_w2v_vector(w2v_dict, tweet):\n list_of_word_vectors = [w2v_dict[w]\n for w in tweet if w in w2v_dict.vocab.keys()]\n\n if len(list_of_word_vectors) == 0:\n result = [0.0]*SIZE\n else:\n result = np.sum(list_of_word_vectors, axis=0) / \\\n len(list_of_word_vectors)\n\n return result\n\n\nX_train_w2v = X_train['clean_text_wordlist'].apply(\n lambda x: compute_avg_w2v_vector(model.wv, x))\nX_test_w2v = X_test['clean_text_wordlist'].apply(\n lambda x: compute_avg_w2v_vector(model.wv, x))\n\n\n# This gives us a Series with a vector of dimension equal to SIZE. Now we will split this vector and create a DataFrame with each vector value in a separate column. That way we can concatenate the word2vec variables to the other TextCounts variables. We need to reuse the index of X_train and X_test respectively. Otherwise this will give issues (duplicates) in the concatenation later on.\n\n\nX_train_w2v = pd.DataFrame(X_train_w2v.values.tolist(), index=X_train.index)\nX_test_w2v = pd.DataFrame(X_test_w2v.values.tolist(), index=X_test.index)\n", "project_metadata": {"full_name": "bertcarremans/TwitterUSAirlineSentiment", "description": "Code to experiment with text mining techniques for sentiment analysis in data set is from Kaggle.", "topics": ["python", "data-science", "text-mining", "word2vec"], "git_url": "git://github.com/bertcarremans/TwitterUSAirlineSentiment.git", "stars": 13, "watchers": 13, "forks": 24, "created": "2017-12-27T10:28:30Z", "size": 6767, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 518678}, "last_updated": "2020-09-12T13:27:47Z"}, "intent": "# Concatenate with the TextCounts variables"}, {"original_comment": "# re-map the categorical variable Sex into numbers\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Cardinality\n#\n# The values of a categorical variable are selected from a group of categories, also called labels. For example, in the variable _gender_ the categories or labels are male and female, whereas in the variable _city_ the labels can be London, Manchester, Brighton and so on.\n#\n# Different categorical variables contain different number of labels or categories. The variable gender contains only 2 labels, but a variable like city or postcode, can contain a huge number of different labels.\n#\n# The number of different labels within a categorical variable is known as cardinality. 
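# (A small aside with a toy sketch of counting cardinality in pandas; the Titanic variables themselves are inspected properly further below.)

#%%

# Toy frame, for illustration only: nunique() counts the distinct labels per column.
import pandas as pd

toy = pd.DataFrame({'sex': ['male', 'female', 'male'],
                    'cabin': ['C85', 'E46', 'C85']})
print(toy.nunique())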
A high number of labels within a variable is known as __high cardinality__.\n#\n#\n# ### Are multiple labels in a categorical variable a problem?\n#\n# High cardinality may pose the following problems:\n#\n# - Variables with too many labels tend to dominate over those with only a few labels, particularly in **Tree based** algorithms.\n#\n# - A big number of labels within a variable may introduce noise with little, if any, information, therefore making machine learning models prone to over-fit.\n#\n# - Some of the labels may only be present in the training data set, but not in the test set, therefore machine learning algorithms may over-fit to the training set.\n#\n# - Contrarily, some labels may appear only in the test set, therefore leaving the machine learning algorithms unable to perform a calculation over the new (unseen) observation.\n#\n#\n# In particular, **tree methods can be biased towards variables with lots of labels** (variables with high cardinality). Thus, their performance may be affected by high cardinality.\n#\n# Below, I will show the effect of high cardinality of variables on the performance of different machine learning algorithms, and how a quick fix to reduce the number of labels, without any sort of data insight, already helps to boost performance.\n\n# ## In this Demo:\n#\n# We will:\n#\n# - Learn how to quantify cardinality\n# - See examples of high and low cardinality variables\n# - Understand the effect of cardinality when preparing train and test sets\n# - Visualise the effect of cardinality on Machine Learning Model performance\n#\n# We will use the Titanic dataset.\n#\n# - To download the dataset, please refer to the **Datasets** lecture in **Section 1** of the course.\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n# to build machine learning models\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\n\n# to evaluate the models\nfrom sklearn.metrics import roc_auc_score\n\n# to separate data into train and test\nfrom sklearn.model_selection import train_test_split\n\n#%%\n\n# let's load the titanic dataset\n\ndata = pd.read_csv('../titanic.csv')\ndata.head()\n\n\n# The categorical variables in this dataset are Name, Sex, Ticket, Cabin and Embarked.\n#\n# ---------------\n# **Note** that Ticket and Cabin contain both letters and numbers, so they could be treated as Mixed Variables. 
For this demonstration, I will treat them as categorical.\n\n#%%\n\n# let's inspect the cardinality, this is the number\n# of different labels, for the different categorical variables\n\nprint('Number of categories in the variable Name: {}'.format(\n len(data.name.unique())))\n\nprint('Number of categories in the variable Gender: {}'.format(\n len(data.sex.unique())))\n\nprint('Number of categories in the variable Ticket: {}'.format(\n len(data.ticket.unique())))\n\nprint('Number of categories in the variable Cabin: {}'.format(\n len(data.cabin.unique())))\n\nprint('Number of categories in the variable Embarked: {}'.format(\n len(data.embarked.unique())))\n\nprint('Total number of passengers in the Titanic: {}'.format(len(data)))\n\n\n# While the variable Sex contains only 2 categories and Embarked 4 (low cardinality), the variables Ticket, Name and Cabin, as expected, contain a huge number of different labels (high cardinality).\n#\n# To demonstrate the effect of high cardinality in train and test sets and machine learning performance, I will work with the variable Cabin. I will create a new variable with reduced cardinality.\n\n#%%\n\n# let's explore the values / categories of Cabin\n\n# we know from the previous cell that there are 148\n# different cabins, therefore the variable\n# is highly cardinal\n\ndata.cabin.unique()\n\n\n# Let's now reduce the cardinality of the variable. How? instead of using the entire **cabin** value, I will capture only the\n# first letter.\n#\n# ***Rationale***: the first letter indicates the deck on which the cabin was located, and is therefore an indication of both social class status and proximity to the surface of the Titanic. Both are known to improve the probability of survival.\n\n#%%\n\n# let's capture the first letter of Cabin\ndata['Cabin_reduced'] = data['cabin'].astype(str).str[0]\n\ndata[['cabin', 'Cabin_reduced']].head()\n\n#%%\n\nprint('Number of categories in the variable Cabin: {}'.format(\n len(data.cabin.unique())))\n\nprint('Number of categories in the variable Cabin reduced: {}'.format(\n len(data.Cabin_reduced.unique())))\n\n\n# We reduced the number of different labels from 182 to 9.\n\n#%%\n\n# let's separate into training and testing set\n# in order to build machine learning models\n\nuse_cols = ['cabin', 'Cabin_reduced', 'sex']\n\n# this functions comes from scikit-learn\nX_train, X_test, y_train, y_test = train_test_split(\n data[use_cols],\n data['survived'],\n test_size=0.3,\n random_state=0)\n\nX_train.shape, X_test.shape\n\n\n# ### High cardinality leads to uneven distribution of categories in train and test sets\n#\n# When a variable is highly cardinal, often some categories land only on the training set, or only on the testing set. If present only in the training set, they may lead to over-fitting. 
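# One common defensive pattern at encoding time, sketched on a toy example (this is not part of the notebook itself): fit the category-to-integer mapping on the training data only, and send anything unseen to a reserved code so that scoring new data does not fail.

#%%

# Toy sketch: unseen test categories are routed to a reserved integer code.
import pandas as pd

train_cabin = pd.Series(['C85', 'E46', 'C85'])
test_cabin = pd.Series(['C85', 'B42'])            # 'B42' never seen in training

mapping = {cat: i for i, cat in enumerate(train_cabin.unique())}
unseen_code = len(mapping)                         # reserved bucket for new labels
print(test_cabin.map(mapping).fillna(unseen_code).astype(int))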
If present only on the testing set, the machine learning algorithm will not know how to handle them, as it has not seen them during training.\n\n#%%\n\n# Let's find out labels present only in the training set\n\nunique_to_train_set = [\n x for x in X_train.cabin.unique() if x not in X_test.cabin.unique()\n]\n\nlen(unique_to_train_set)\n\n\n# There are 113 Cabins only present in the training set, and not in the testing set.\n\n#%%\n\n# Let's find out labels present only in the test set\n\nunique_to_test_set = [\n x for x in X_test.cabin.unique() if x not in X_train.cabin.unique()\n]\n\nlen(unique_to_test_set)\n\n\n# Variables with high cardinality tend to have values (i.e., categories) present in the training set, that are not present in the test set, and vice versa. This will bring problems at the time of training (due to over-fitting) and scoring of new data (how should the model deal with unseen categories?).\n#\n# This problem is almost overcome by reducing the cardinality of the variable. See below.\n\n#%%\n\n# Let's find out labels present only in the training set\n# for Cabin with reduced cardinality\n\nunique_to_train_set = [\n x for x in X_train['Cabin_reduced'].unique()\n if x not in X_test['Cabin_reduced'].unique()\n]\n\nlen(unique_to_train_set)\n\n#%%\n\n# Let's find out labels present only in the test set\n# for Cabin with reduced cardinality\n\nunique_to_test_set = [\n x for x in X_test['Cabin_reduced'].unique()\n if x not in X_train['Cabin_reduced'].unique()\n]\n\nlen(unique_to_test_set)\n\n\n# Observe how by reducing the cardinality there is now only 1 label in the training set that is not present in the test set. And no label in the test set that is not contained in the training set as well.\n#\n# ### Effect of cardinality on Machine Learning Model Performance\n#\n# In order to evaluate the effect of categorical variables in machine learning models, I will quickly replace the categories by numbers. See below.\n\n#%%\n\n# Let's re-map Cabin into numbers so we can use it to train ML models\n\n# I will replace each cabin by a number\n# to quickly demonstrate the effect of\n# labels on machine learning algorithms\n\n##############\n# Note: this is neither the only nor the best\n# way to encode categorical variables into numbers\n# there is more on these techniques in the section\n# \"Encoding categorical variales\"\n##############\n\ncabin_dict = {k: i for i, k in enumerate(X_train.cabin.unique(), 0)}\ncabin_dict\n\n#%%\n\n# replace the labels in Cabin, using the dic created above\nX_train.loc[:, 'Cabin_mapped'] = X_train.loc[:, 'cabin'].map(cabin_dict)\nX_test.loc[:, 'Cabin_mapped'] = X_test.loc[:, 'cabin'].map(cabin_dict)\n\nX_train[['Cabin_mapped', 'cabin']].head(10)\n\n\n# We see how NaN takes the value 0 in the new variable, E36 takes the value 1, C68 takes the value 2, and so on.\n\n#%%\n\n# Now I will replace the letters in the reduced cabin variable\n# with the same procedure\n\n# create replace dictionary\ncabin_dict = {k: i for i, k in enumerate(X_train['Cabin_reduced'].unique(), 0)}\n\n# replace labels by numbers with dictionary\nX_train.loc[:, 'Cabin_reduced'] = X_train.loc[:, 'Cabin_reduced'].map(\n cabin_dict)\nX_test.loc[:, 'Cabin_reduced'] = X_test.loc[:, 'Cabin_reduced'].map(cabin_dict)\n\nX_train[['Cabin_reduced', 'cabin']].head(20)\n\n\n# We see now that E36 and E24 take the same number, 1, because we are capturing only the letter. 
They both start with E.\n\n#%%", "target_code": "X_train.loc[:, 'sex'] = X_train.loc[:, 'sex'].map({'male': 0, 'female': 1})\nX_test.loc[:, 'sex'] = X_test.loc[:, 'sex'].map({'male': 0, 'female': 1})\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Cardinality\n#\n# The values of a categorical variable are selected from a group of categories, also called labels. For example, in the variable _gender_ the categories or labels are male and female, whereas in the variable _city_ the labels can be London, Manchester, Brighton and so on.\n#\n# Different categorical variables contain different number of labels or categories. The variable gender contains only 2 labels, but a variable like city or postcode, can contain a huge number of different labels.\n#\n# The number of different labels within a categorical variable is known as cardinality. A high number of labels within a variable is known as __high cardinality__.\n#\n#\n# ### Are multiple labels in a categorical variable a problem?\n#\n# High cardinality may pose the following problems:\n#\n# - Variables with too many labels tend to dominate over those with only a few labels, particularly in **Tree based** algorithms.\n#\n# - A big number of labels within a variable may introduce noise with little, if any, information, therefore making machine learning models prone to over-fit.\n#\n# - Some of the labels may only be present in the training data set, but not in the test set, therefore machine learning algorithms may over-fit to the training set.\n#\n# - Contrarily, some labels may appear only in the test set, therefore leaving the machine learning algorithms unable to perform a calculation over the new (unseen) observation.\n#\n#\n# In particular, **tree methods can be biased towards variables with lots of labels** (variables with high cardinality). Thus, their performance may be affected by high cardinality.\n#\n# Below, I will show the effect of high cardinality of variables on the performance of different machine learning algorithms, and how a quick fix to reduce the number of labels, without any sort of data insight, already helps to boost performance.\n\n# ## In this Demo:\n#\n# We will:\n#\n# - Learn how to quantify cardinality\n# - See examples of high and low cardinality variables\n# - Understand the effect of cardinality when preparing train and test sets\n# - Visualise the effect of cardinality on Machine Learning Model performance\n#\n# We will use the Titanic dataset.\n#\n# - To download the dataset, please refer to the **Datasets** lecture in **Section 1** of the course.\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n# to build machine learning models\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\n\n# to evaluate the models\nfrom sklearn.metrics import roc_auc_score\n\n# to separate data into train and test\nfrom sklearn.model_selection import train_test_split\n\n\n# let's load the titanic dataset\n\ndata = pd.read_csv('../titanic.csv')\ndata.head()\n\n\n# The categorical variables in this dataset are Name, Sex, Ticket, Cabin and Embarked.\n#\n# ---------------\n# **Note** that Ticket and Cabin contain both letters and numbers, so they could be treated as Mixed Variables. 
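# (Side note, not done in this notebook: a mixed variable such as Cabin can also be split into its letter and numeric parts with a regular expression, and the two parts treated separately.)

import pandas as pd

cabin_demo = pd.Series(['C85', 'E46', 'B42'])
print(cabin_demo.str.extract(r'([A-Za-z]+)(\d+)'))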
For this demonstration, I will treat them as categorical.\n\n\n# let's inspect the cardinality, this is the number\n# of different labels, for the different categorical variables\n\nprint('Number of categories in the variable Name: {}'.format(\n len(data.name.unique())))\n\nprint('Number of categories in the variable Gender: {}'.format(\n len(data.sex.unique())))\n\nprint('Number of categories in the variable Ticket: {}'.format(\n len(data.ticket.unique())))\n\nprint('Number of categories in the variable Cabin: {}'.format(\n len(data.cabin.unique())))\n\nprint('Number of categories in the variable Embarked: {}'.format(\n len(data.embarked.unique())))\n\nprint('Total number of passengers in the Titanic: {}'.format(len(data)))\n\n\n# While the variable Sex contains only 2 categories and Embarked 4 (low cardinality), the variables Ticket, Name and Cabin, as expected, contain a huge number of different labels (high cardinality).\n#\n# To demonstrate the effect of high cardinality in train and test sets and machine learning performance, I will work with the variable Cabin. I will create a new variable with reduced cardinality.\n\n\n# let's explore the values / categories of Cabin\n\n# we know from the previous cell that there are 148\n# different cabins, therefore the variable\n# is highly cardinal\n\ndata.cabin.unique()\n\n\n# Let's now reduce the cardinality of the variable. How? instead of using the entire **cabin** value, I will capture only the\n# first letter.\n#\n# ***Rationale***: the first letter indicates the deck on which the cabin was located, and is therefore an indication of both social class status and proximity to the surface of the Titanic. Both are known to improve the probability of survival.\n\n\n# let's capture the first letter of Cabin\ndata['Cabin_reduced'] = data['cabin'].astype(str).str[0]\n\ndata[['cabin', 'Cabin_reduced']].head()\n\n\nprint('Number of categories in the variable Cabin: {}'.format(\n len(data.cabin.unique())))\n\nprint('Number of categories in the variable Cabin reduced: {}'.format(\n len(data.Cabin_reduced.unique())))\n\n\n# We reduced the number of different labels from 182 to 9.\n\n\n# let's separate into training and testing set\n# in order to build machine learning models\n\nuse_cols = ['cabin', 'Cabin_reduced', 'sex']\n\n# this functions comes from scikit-learn\nX_train, X_test, y_train, y_test = train_test_split(\n data[use_cols],\n data['survived'],\n test_size=0.3,\n random_state=0)\n\nX_train.shape, X_test.shape\n\n\n# ### High cardinality leads to uneven distribution of categories in train and test sets\n#\n# When a variable is highly cardinal, often some categories land only on the training set, or only on the testing set. If present only in the training set, they may lead to over-fitting. If present only on the testing set, the machine learning algorithm will not know how to handle them, as it has not seen them during training.\n\n\n# Let's find out labels present only in the training set\n\nunique_to_train_set = [\n x for x in X_train.cabin.unique() if x not in X_test.cabin.unique()\n]\n\nlen(unique_to_train_set)\n\n\n# There are 113 Cabins only present in the training set, and not in the testing set.\n\n\n# Let's find out labels present only in the test set\n\nunique_to_test_set = [\n x for x in X_test.cabin.unique() if x not in X_train.cabin.unique()\n]\n\nlen(unique_to_test_set)\n\n\n# Variables with high cardinality tend to have values (i.e., categories) present in the training set, that are not present in the test set, and vice versa. 
This will bring problems at the time of training (due to over-fitting) and scoring of new data (how should the model deal with unseen categories?).\n#\n# This problem is almost overcome by reducing the cardinality of the variable. See below.\n\n\n# Let's find out labels present only in the training set\n# for Cabin with reduced cardinality\n\nunique_to_train_set = [\n x for x in X_train['Cabin_reduced'].unique()\n if x not in X_test['Cabin_reduced'].unique()\n]\n\nlen(unique_to_train_set)\n\n\n# Let's find out labels present only in the test set\n# for Cabin with reduced cardinality\n\nunique_to_test_set = [\n x for x in X_test['Cabin_reduced'].unique()\n if x not in X_train['Cabin_reduced'].unique()\n]\n\nlen(unique_to_test_set)\n\n\n# Observe how by reducing the cardinality there is now only 1 label in the training set that is not present in the test set. And no label in the test set that is not contained in the training set as well.\n#\n# ### Effect of cardinality on Machine Learning Model Performance\n#\n# In order to evaluate the effect of categorical variables in machine learning models, I will quickly replace the categories by numbers. See below.\n\n\n# Let's re-map Cabin into numbers so we can use it to train ML models\n\n# I will replace each cabin by a number\n# to quickly demonstrate the effect of\n# labels on machine learning algorithms\n\n##############\n# Note: this is neither the only nor the best\n# way to encode categorical variables into numbers\n# there is more on these techniques in the section\n# \"Encoding categorical variales\"\n##############\n\ncabin_dict = {k: i for i, k in enumerate(X_train.cabin.unique(), 0)}\ncabin_dict\n\n\n# replace the labels in Cabin, using the dic created above\nX_train.loc[:, 'Cabin_mapped'] = X_train.loc[:, 'cabin'].map(cabin_dict)\nX_test.loc[:, 'Cabin_mapped'] = X_test.loc[:, 'cabin'].map(cabin_dict)\n\nX_train[['Cabin_mapped', 'cabin']].head(10)\n\n\n# We see how NaN takes the value 0 in the new variable, E36 takes the value 1, C68 takes the value 2, and so on.\n\n\n# Now I will replace the letters in the reduced cabin variable\n# with the same procedure\n\n# create replace dictionary\ncabin_dict = {k: i for i, k in enumerate(X_train['Cabin_reduced'].unique(), 0)}\n\n# replace labels by numbers with dictionary\nX_train.loc[:, 'Cabin_reduced'] = X_train.loc[:, 'Cabin_reduced'].map(\n cabin_dict)\nX_test.loc[:, 'Cabin_reduced'] = X_test.loc[:, 'Cabin_reduced'].map(cabin_dict)\n\nX_train[['Cabin_reduced', 'cabin']].head(20)\n\n\n# We see now that E36 and E24 take the same number, 1, because we are capturing only the letter. They both start with E.\n\n\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "intent": "# re-map the categorical variable Sex into numbers"}, {"original_comment": "# Define the X and Y axis labels\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

\"Zeros\" in a Forced Response

\n#

MCHE 485: Mechanical Vibrations

\n#

Dr. Joshua Vaughan
\n# joshua.vaughan@louisiana.edu
\n# http://www.ucs.louisiana.edu/~jev9637/

\n\n#

\n# \t\"A
\n# Figure 1: A Four-Mass-Spring System with Excitation Force on the First Mass\n#

\n#\n# This notebook demonstrates the eigenvalue/eigenvector problem using a four-mass-spring-damper system shown in Figure 1. We'll just look at one example set of parameters. The same techniques apply for other parameters and for larger matrices.\n#\n# The equations of motion for the system are:\n#\n# $ \\quad m_1 \\ddot{x}_1 + (k_1+k_2)x_1 - k_2 x_2 = f $\n#\n# $ \\quad m_2 \\ddot{x}_2 -k_2 x_1 + (k_2 + k_3)x_2 - k_3 x_3 = 0 $\n#\n# $ \\quad m_3 \\ddot{x}_3 -k_3 x_2 + (k_3 + k_4)x_3 - k_4 x_4 = 0 $\n#\n# $ \\quad m_4 \\ddot{x}_4 -k_4 x_3 + (k_4 + k_5)x_4 = 0 $\n#\n# We could also write these equations in matrix form:\n#\n# $ \\quad \\begin{bmatrix} m_1 & 0 & 0 & 0\\\\\n# 0 & m_2 & 0 & 0\\\\\n# 0 & 0 & m_3 & 0\\\\\n# 0 & 0 & 0 & m_4\\\\ \\end{bmatrix}\\begin{bmatrix}\\ddot{x}_1 \\\\ \\ddot{x}_2\n# \\\\ \\ddot{x}_3\\\\ \\ddot{x}_4\\end{bmatrix} +\n# %\n# \\begin{bmatrix} k_1 + k_2 & -k_2 & 0 & 0 \\\\\n# -k_2 & k_2 + k_3 & -k_3 & 0 \\\\\n# 0 & -k_3 & k_3 + k_4 & -k_4 \\\\\n# 0 & 0 & -k_4 & k_4+k_5\\end{bmatrix}\\begin{bmatrix}x_1 \\\\ x_2\\\\ x_3\\\\ x_4\\end{bmatrix} = \\begin{bmatrix}f \\\\ 0 \\\\ 0 \\\\ 0 \\end{bmatrix}$\n#\n# Define\n#\n# $ \\quad M = \\begin{bmatrix} m_1 & 0 & 0 & 0\\\\\n# 0 & m_2 & 0 & 0\\\\\n# 0 & 0 & m_3 & 0\\\\\n# 0 & 0 & 0 & m_4\\\\ \\end{bmatrix} $\n#\n# and\n#\n# $ \\quad K = \\begin{bmatrix} k_1 + k_2 & -k_2 & 0 & 0 \\\\\n# -k_2 & k_2 + k_3 & -k_3 & 0 \\\\\n# 0 & -k_3 & k_3 + k_4 & -k_4 \\\\\n# 0 & 0 & -k_4 & k_4+k_5\\end{bmatrix} $\n#\n# Using $M$ and $K$, we want to solve:\n#\n# $ \\quad \\left[K - \\omega^2 M\\right]\\bar{X} = 0 $\n#\n# for $\\bar{X}$. This is an eigenvalue problem.\n#\n# For information on how to obtain these equations, you can see the lectures at the [class website](http://www.ucs.louisiana.edu/~jev9637/MCHE485.html).\n#\n# We'll use the [Scipy version of the linear algebra module](http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.linalg.eigh.html). It allows us to solve the \"general\" eignevalue problem.\n\n#%%\n\nfrom scipy.integrate import odeint\nimport urllib.request\nfrom IPython.core.display import HTML\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n#%%\n\n# We want our plots to be displayed inline, not in a separate window\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Import the plotting functions\n# Note: Using the 'from module import *' notation is usually a bad idea.\n\n#%%\n\n# Let's also improve the printing of NumPy arrays.\nnp.set_printoptions(precision=3, suppress=True)\n\n\n# To see how to solve this eigenvalue problem, we will use some example parameters, set up below. All the spring constants are equal and the masses are equal.\n\n#%%\n\n# Define the matrices\nm1 = 1.0\nm2 = 1.0\nm3 = 1.0\nm4 = 1.0\n\nk1 = 4.0\nk2 = 4.0\nk3 = 4.0\nk4 = 4.0\nk5 = 4.0\n\nM = np.array([[m1, 0, 0, 0],\n [0, m2, 0, 0],\n [0, 0, m3, 0],\n [0, 0, 0, m4]])\n\nK = np.array([[k1 + k2, -k2, 0, 0],\n [-k2, k2 + k3, -k3, 0],\n [0, -k3, k3 + k4, -k4],\n [0, 0, -k4, k4+k5]])\n\n#%%\n\n# We'll use the scipy version of the linear algebra\n\neigenvals, eigenvects = linalg.eigh(K, M)\n\n\n#\n# The linalg.eigh function returns two arrays, one of the eigenvalues and one of the eigenvectors. The eigenvalues are the square of the two natural frequencies. 
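# As a quick numerical check (a sketch added for illustration, reusing the arrays defined above), the first eigenpair should satisfy the generalized relation $ \quad K\bar{X} = \omega^2 M \bar{X} $, and the eigenvectors should be mass-normalized:

#%%

# Verify the first eigenpair and the mass-normalization of the eigenvectors.
v0 = eigenvects[:, 0]
lam0 = eigenvals[0]
print(np.allclose(K @ v0, lam0 * (M @ v0)))
print(np.allclose(eigenvects.T @ M @ eigenvects, np.eye(4)))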
The eigenvectors are returned in normalized form, with each ''row'' of the array representing an eigenvector.\n#\n\n#%%\n\nprint('\\n')\nprint('The resulting eigenalues are {:.2f}, {:.2f}, {:.2f}, and {:.2f}.'.format(\n eigenvals[0], eigenvals[1], eigenvals[2], eigenvals[3]))\nprint('\\n')\nprint('So the natrual frequencies are {:.2f}rad/s, {:.2f}rad/s, {:.2f}rad/s, and {:.2f}rad/s.'.format(\n np.sqrt(eigenvals[0]), np.sqrt(eigenvals[1]), np.sqrt(eigenvals[2]), np.sqrt(eigenvals[3])))\nprint('\\n')\n\n#%%\n\nprint('\\n')\nprint('The first eigenvector is ' + str(eigenvects[:, 0]) + '.')\nprint('\\n')\nprint('The second eigenvector is ' + str(eigenvects[:, 1]) + '.')\nprint('\\n')\nprint('The third eigenvector is ' + str(eigenvects[:, 2]) + '.')\nprint('\\n')\nprint('The fourth eigenvector is ' + str(eigenvects[:, 3]) + '.')\nprint('\\n')\n\n\n# # Responses\n# Now, let's look at the response and see how it reflects the four modes of the system.\n\n#%%\n\n# Define the equations of motion\n\n# Define the system as a series of 1st order ODEs (beginnings of state-space form)\ndef eq_of_motion(w, t, p):\n \"\"\"\n Defines the differential equations for the coupled spring-mass system.\n\n Arguments:\n w : vector of the state variables:\n w = [x1, x1_dot, x2, x2_dot, x3, x3_dot, x4, x4_dot]\n t : time\n p : vector of the parameters:\n p = [m1, m2, m3, m4, k1, k2, k3, k4, k5]\n \"\"\"\n x1, x1_dot, x2, x2_dot, x3, x3_dot, x4, x4_dot = w\n m1, m2, m3, m4, k1, k2, k3, k4, k5 = p\n\n # Create sysODE = (x', x_dot'): - Here, we're assuming f(t) = 0\n sysODE = [x1_dot,\n (-(k1+k2)*x1 + k2*x2) / m1,\n x2_dot,\n (k2*x1 - (k2+k3)*x2 + k3*x3) / m2,\n x3_dot,\n (k3*x2 - (k3+k4)*x3 + k4*x4) / m3,\n x4_dot,\n (k4*x3 - (k4+k5)*x4) / m4]\n return sysODE\n\n#%%\n\n# Import the ODE solver\n\n# Set up simulation parameters\n\n# ODE solver parameters\nabserr = 1.0e-9\nrelerr = 1.0e-9\nmax_step = 0.01\nstoptime = 10.0\nnumpoints = 10001\n\n# Create the time samples for the output of the ODE solver\nt = np.linspace(0.0, stoptime, numpoints)\n\n\n# ## Example Free Vibration\n# Let's start by looking at some free vibration. For this set of parameters. 
In the code below, we choose initial conditions:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# $ \\quad x_3(0) = x_4(0) = 0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = \\dot{x}_3(0) = \\dot{x}_4(0) = 0$\n\n#%%\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = 0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\nx3_init = 0.0\nx3_dot_init = 0.0\nx4_init = 0.0\nx4_dot_init = 0.0\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, m3, m4, k1, k2, k3, k4, k5]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init,\n x3_init, x3_dot_init, x4_init, x4_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n#%%\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\nplt.plot(t, resp[:, 4], linewidth=2, linestyle=\"-.\", label=r'$x_3$')\nplt.plot(t, resp[:, 6], linewidth=2, linestyle=\":\", label=r'$x_4$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg = plt.legend(loc='upper right', ncol=2, fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_1.pdf')\n\n# Resize the figure for better display in the notebook\nfig.set_size_inches(9, 6)\n\n\n# ## Frequency Response \u2013 Force on $m_1$\n# Now, let's look at the frequency response of this system. 
It will tell us how many frequencies there can be zero amplitude response for each mass.\n\n#%%\n\nF1 = 1.0\nF2 = 0.0\nF3 = 0.0\nF4 = 0.0\n\nF = [F1, F2, F3, F4]\n\nw = np.linspace(0, 6, 1800)\nX = np.zeros((len(w), 4))\n\n# This is (K-w^2 M)^-1 * F\nfor ii, freq in enumerate(w):\n X[ii, :] = np.dot(linalg.inv(K - freq**2 * M), F)\n\n# Let's mask the discontinuity, so it isn't plotted\npos = np.where(np.abs(X[:, 0]) >= 15)\nX[pos, :] = np.nan\nw[pos] = np.nan\n\n#%%\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to CMU Serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\n\nplt.subplot(2, 2, 1)\nplt.plot(w, X[:, 0], linewidth=2, label=r'$\\bar{x}_1$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_1$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n\nplt.subplot(2, 2, 2)\nplt.plot(w, X[:, 1], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_2$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_2$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 3)\nplt.plot(w, X[:, 2], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_3$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_3$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 4)\nplt.plot(w, X[:, 3], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_4$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_4$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# # Create the legend, then fix the fontsize\n# leg = plt.legend(loc='upper right', fancybox=True)\n# ltext = leg.get_texts()\n# plt.setp(ltext,family='serif',fontsize=16)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5, w_pad=3.0, h_pad=2.0)\n\n\n# save the figure as a high-res pdf in the current folder\n# plt.savefig('Spring_Pendulum_Example_Amp.pdf')\n\n# fig.set_size_inches(9,6) # Resize the figure for better display in the notebook\n\n\n# ## Frequency Response \u2013 Force on $m_2$\n# All we need to change to examine the case in Figure 2, which has the force input on the second mass, is the $F$ matrix we defined above. 
Then, re-plotting the frequency responses will show at which (and how many) frequencies each mass has a zero-amplitude response.\n#\n#

\n# Figure 2: A Four-Mass-Spring System with Excitation Force on the Second Mass\n#
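# Before re-running the sweep with the force on $m_2$, the next cell defines a small helper (a sketch, not part of the original notebook) that estimates where a response column crosses zero. Applying it to each column of `X` gives both the number and the approximate location of the zero-amplitude frequencies discussed above. It assumes the `w` and `X` arrays produced by a sweep cell, including the NaN masking of the resonances.

#%%

# Sketch: locate zero-amplitude frequencies in a response column by finding
# sign changes between adjacent, non-NaN samples of the frequency sweep.
# Pairs that straddle a masked resonance contain NaN and are therefore skipped.
def zero_crossing_freqs(w, X_col):
    ok = ~np.isnan(X_col[:-1]) & ~np.isnan(X_col[1:])
    flips = ok & (np.sign(X_col[:-1]) != np.sign(X_col[1:]))
    idx = np.where(flips)[0]
    # Linear interpolation between the two samples that bracket each crossing
    return w[idx] - X_col[idx] * (w[idx + 1] - w[idx]) / (X_col[idx + 1] - X_col[idx])

# Example usage, after a sweep has filled w and X:
# for i in range(4):
#     zeros = zero_crossing_freqs(w, X[:, i])
#     print('x_{}: {} zero(s) near'.format(i + 1, len(zeros)), zeros)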

\n\n#%%\n\nF1 = 0.0\nF2 = 1.0\nF3 = 0.0\nF4 = 0.0\n\nF = [F1, F2, F3, F4]\n\nw = np.linspace(0, 6, 1200)\nX = np.zeros((len(w), 4))\n\n# This is (K-w^2 M)^-1 * F\nfor ii, freq in enumerate(w):\n X[ii, :] = np.dot(linalg.inv(K - freq**2 * M), F)\n\n# Let's mask the discontinuity, so it isn't plotted\npos = np.where(np.abs(X[:, 0]) >= 15)\nX[pos, :] = np.nan\nw[pos] = np.nan\n\n#%%\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to CMU Serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\n\nplt.subplot(2, 2, 1)\nplt.plot(w, X[:, 0], linewidth=2, label=r'$\\bar{x}_1$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_1$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-2, 2)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n\nplt.subplot(2, 2, 2)\nplt.plot(w, X[:, 1], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_2$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_2$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-2, 2)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 3)\nplt.plot(w, X[:, 2], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_3$')", "target_code": "plt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_3$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

\"Zeros\" in a Forced Response

\n#\n# MCHE 485: Mechanical Vibrations\n#\n# Dr. Joshua Vaughan\n# joshua.vaughan@louisiana.edu\n# http://www.ucs.louisiana.edu/~jev9637/\n\n#\n# Figure 1: A Four-Mass-Spring System with Excitation Force on the First Mass\n#

\n#\n# This notebook demonstrates the eigenvalue/eigenvector problem using a four-mass-spring-damper system shown in Figure 1. We'll just look at one example set of parameters. The same techniques apply for other parameters and for larger matrices.\n#\n# The equations of motion for the system are:\n#\n# $ \\quad m_1 \\ddot{x}_1 + (k_1+k_2)x_1 - k_2 x_2 = f $\n#\n# $ \\quad m_2 \\ddot{x}_2 -k_2 x_1 + (k_2 + k_3)x_2 - k_3 x_3 = 0 $\n#\n# $ \\quad m_3 \\ddot{x}_3 -k_3 x_2 + (k_3 + k_4)x_3 - k_4 x_4 = 0 $\n#\n# $ \\quad m_4 \\ddot{x}_4 -k_4 x_3 + (k_4 + k_5)x_4 = 0 $\n#\n# We could also write these equations in matrix form:\n#\n# $ \\quad \\begin{bmatrix} m_1 & 0 & 0 & 0\\\\\n# 0 & m_2 & 0 & 0\\\\\n# 0 & 0 & m_3 & 0\\\\\n# 0 & 0 & 0 & m_4\\\\ \\end{bmatrix}\\begin{bmatrix}\\ddot{x}_1 \\\\ \\ddot{x}_2\n# \\\\ \\ddot{x}_3\\\\ \\ddot{x}_4\\end{bmatrix} +\n# %\n# \\begin{bmatrix} k_1 + k_2 & -k_2 & 0 & 0 \\\\\n# -k_2 & k_2 + k_3 & -k_3 & 0 \\\\\n# 0 & -k_3 & k_3 + k_4 & -k_4 \\\\\n# 0 & 0 & -k_4 & k_4+k_5\\end{bmatrix}\\begin{bmatrix}x_1 \\\\ x_2\\\\ x_3\\\\ x_4\\end{bmatrix} = \\begin{bmatrix}f \\\\ 0 \\\\ 0 \\\\ 0 \\end{bmatrix}$\n#\n# Define\n#\n# $ \\quad M = \\begin{bmatrix} m_1 & 0 & 0 & 0\\\\\n# 0 & m_2 & 0 & 0\\\\\n# 0 & 0 & m_3 & 0\\\\\n# 0 & 0 & 0 & m_4\\\\ \\end{bmatrix} $\n#\n# and\n#\n# $ \\quad K = \\begin{bmatrix} k_1 + k_2 & -k_2 & 0 & 0 \\\\\n# -k_2 & k_2 + k_3 & -k_3 & 0 \\\\\n# 0 & -k_3 & k_3 + k_4 & -k_4 \\\\\n# 0 & 0 & -k_4 & k_4+k_5\\end{bmatrix} $\n#\n# Using $M$ and $K$, we want to solve:\n#\n# $ \\quad \\left[K - \\omega^2 M\\right]\\bar{X} = 0 $\n#\n# for $\\bar{X}$. This is an eigenvalue problem.\n#\n# For information on how to obtain these equations, you can see the lectures at the [class website](http://www.ucs.louisiana.edu/~jev9637/MCHE485.html).\n#\n# We'll use the [Scipy version of the linear algebra module](http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.linalg.eigh.html). It allows us to solve the \"general\" eignevalue problem.\n\n\nfrom scipy.integrate import odeint\nimport urllib.request\nfrom IPython.core.display import HTML\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n\n# We want our plots to be displayed inline, not in a separate window\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Import the plotting functions\n# Note: Using the 'from module import *' notation is usually a bad idea.\n\n\n# Let's also improve the printing of NumPy arrays.\nnp.set_printoptions(precision=3, suppress=True)\n\n\n# To see how to solve this eigenvalue problem, we will use some example parameters, set up below. All the spring constants are equal and the masses are equal.\n\n\n# Define the matrices\nm1 = 1.0\nm2 = 1.0\nm3 = 1.0\nm4 = 1.0\n\nk1 = 4.0\nk2 = 4.0\nk3 = 4.0\nk4 = 4.0\nk5 = 4.0\n\nM = np.array([[m1, 0, 0, 0],\n [0, m2, 0, 0],\n [0, 0, m3, 0],\n [0, 0, 0, m4]])\n\nK = np.array([[k1 + k2, -k2, 0, 0],\n [-k2, k2 + k3, -k3, 0],\n [0, -k3, k3 + k4, -k4],\n [0, 0, -k4, k4+k5]])\n\n\n# We'll use the scipy version of the linear algebra\n\neigenvals, eigenvects = linalg.eigh(K, M)\n\n\n#\n# The linalg.eigh function returns two arrays, one of the eigenvalues and one of the eigenvectors. The eigenvalues are the square of the two natural frequencies. 
The eigenvectors are returned in normalized form, with each ''row'' of the array representing an eigenvector.\n#\n\n\nprint('\\n')\nprint('The resulting eigenalues are {:.2f}, {:.2f}, {:.2f}, and {:.2f}.'.format(\n eigenvals[0], eigenvals[1], eigenvals[2], eigenvals[3]))\nprint('\\n')\nprint('So the natrual frequencies are {:.2f}rad/s, {:.2f}rad/s, {:.2f}rad/s, and {:.2f}rad/s.'.format(\n np.sqrt(eigenvals[0]), np.sqrt(eigenvals[1]), np.sqrt(eigenvals[2]), np.sqrt(eigenvals[3])))\nprint('\\n')\n\n\nprint('\\n')\nprint('The first eigenvector is ' + str(eigenvects[:, 0]) + '.')\nprint('\\n')\nprint('The second eigenvector is ' + str(eigenvects[:, 1]) + '.')\nprint('\\n')\nprint('The third eigenvector is ' + str(eigenvects[:, 2]) + '.')\nprint('\\n')\nprint('The fourth eigenvector is ' + str(eigenvects[:, 3]) + '.')\nprint('\\n')\n\n\n# # Responses\n# Now, let's look at the response and see how it reflects the four modes of the system.\n\n\n# Define the equations of motion\n\n# Define the system as a series of 1st order ODEs (beginnings of state-space form)\ndef eq_of_motion(w, t, p):\n \"\"\"\n Defines the differential equations for the coupled spring-mass system.\n\n Arguments:\n w : vector of the state variables:\n w = [x1, x1_dot, x2, x2_dot, x3, x3_dot, x4, x4_dot]\n t : time\n p : vector of the parameters:\n p = [m1, m2, m3, m4, k1, k2, k3, k4, k5]\n \"\"\"\n x1, x1_dot, x2, x2_dot, x3, x3_dot, x4, x4_dot = w\n m1, m2, m3, m4, k1, k2, k3, k4, k5 = p\n\n # Create sysODE = (x', x_dot'): - Here, we're assuming f(t) = 0\n sysODE = [x1_dot,\n (-(k1+k2)*x1 + k2*x2) / m1,\n x2_dot,\n (k2*x1 - (k2+k3)*x2 + k3*x3) / m2,\n x3_dot,\n (k3*x2 - (k3+k4)*x3 + k4*x4) / m3,\n x4_dot,\n (k4*x3 - (k4+k5)*x4) / m4]\n return sysODE\n\n\n# Import the ODE solver\n\n# Set up simulation parameters\n\n# ODE solver parameters\nabserr = 1.0e-9\nrelerr = 1.0e-9\nmax_step = 0.01\nstoptime = 10.0\nnumpoints = 10001\n\n# Create the time samples for the output of the ODE solver\nt = np.linspace(0.0, stoptime, numpoints)\n\n\n# ## Example Free Vibration\n# Let's start by looking at some free vibration. For this set of parameters. 
In the code below, we choose initial conditions:\n#\n# $ \\quad x_1(0) = x_2(0) = x_0$\n#\n# $ \\quad x_3(0) = x_4(0) = 0$\n#\n# and\n#\n# $ \\quad \\dot{x}_1(0) = \\dot{x}_2(0) = \\dot{x}_3(0) = \\dot{x}_4(0) = 0$\n\n\n# Initial conditions\nx1_init = 0.5 # initial x1 position\nx1_dot_init = 0.0 # initial x1 velocity\nx2_init = 0.5 # initial x2 position\nx2_dot_init = 0.0 # initial x2 velocity\nx3_init = 0.0\nx3_dot_init = 0.0\nx4_init = 0.0\nx4_dot_init = 0.0\n\n# Pack the parameters and initial conditions into arrays\np = [m1, m2, m3, m4, k1, k2, k3, k4, k5]\nx0 = [x1_init, x1_dot_init, x2_init, x2_dot_init,\n x3_init, x3_dot_init, x4_init, x4_dot_init]\n\n# Call the ODE solver.\nresp = odeint(eq_of_motion, x0, t, args=(\n p,), atol=abserr, rtol=relerr, hmax=max_step)\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(6, 4))\nax = plt.gca()\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\n\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# Turn on the plot grid and set appropriate linestyle and color\nax.grid(True, linestyle=':', color='0.75')\nax.set_axisbelow(True)\n\n# Define the X and Y axis labels\nplt.xlabel('Time (s)', family='serif', fontsize=22, weight='bold', labelpad=5)\nplt.ylabel('Position (m)', family='serif',\n fontsize=22, weight='bold', labelpad=10)\n\nplt.plot(t, resp[:, 0], linewidth=2, label=r'$x_1$')\nplt.plot(t, resp[:, 2], linewidth=2, linestyle=\"--\", label=r'$x_2$')\nplt.plot(t, resp[:, 4], linewidth=2, linestyle=\"-.\", label=r'$x_3$')\nplt.plot(t, resp[:, 6], linewidth=2, linestyle=\":\", label=r'$x_4$')\n\n# uncomment below and set limits if needed\n# plt.xlim(0,5)\nplt.ylim(-1, 1.35)\nplt.yticks([-0.5, 0, 0.5, 1.0], ['$-x_0$', '$0$', '$x_0$', '$2x_0$'])\n\n# Create the legend, then fix the fontsize\nleg = plt.legend(loc='upper right', ncol=2, fancybox=True)\nltext = leg.get_texts()\nplt.setp(ltext, family='serif', fontsize=18)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5)\n\n# save the figure as a high-res pdf in the current folder\n# It's saved at the original 6x4 size\n# plt.savefig('FreeVibration_mode_1.pdf')\n\n# Resize the figure for better display in the notebook\nfig.set_size_inches(9, 6)\n\n\n# ## Frequency Response \u2013 Force on $m_1$\n# Now, let's look at the frequency response of this system. 
It will tell us how many frequencies there can be zero amplitude response for each mass.\n\n\nF1 = 1.0\nF2 = 0.0\nF3 = 0.0\nF4 = 0.0\n\nF = [F1, F2, F3, F4]\n\nw = np.linspace(0, 6, 1800)\nX = np.zeros((len(w), 4))\n\n# This is (K-w^2 M)^-1 * F\nfor ii, freq in enumerate(w):\n X[ii, :] = np.dot(linalg.inv(K - freq**2 * M), F)\n\n# Let's mask the discontinuity, so it isn't plotted\npos = np.where(np.abs(X[:, 0]) >= 15)\nX[pos, :] = np.nan\nw[pos] = np.nan\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to CMU Serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\n\nplt.subplot(2, 2, 1)\nplt.plot(w, X[:, 0], linewidth=2, label=r'$\\bar{x}_1$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_1$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n\nplt.subplot(2, 2, 2)\nplt.plot(w, X[:, 1], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_2$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_2$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 3)\nplt.plot(w, X[:, 2], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_3$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_3$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 4)\nplt.plot(w, X[:, 3], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_4$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_4$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-4, 4)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n# # Create the legend, then fix the fontsize\n# leg = plt.legend(loc='upper right', fancybox=True)\n# ltext = leg.get_texts()\n# plt.setp(ltext,family='serif',fontsize=16)\n\n# Adjust the page layout filling the page using the new tight_layout command\nplt.tight_layout(pad=0.5, w_pad=3.0, h_pad=2.0)\n\n\n# save the figure as a high-res pdf in the current folder\n# plt.savefig('Spring_Pendulum_Example_Amp.pdf')\n\n# fig.set_size_inches(9,6) # Resize the figure for better display in the notebook\n\n\n# ## Frequency Response \u2013 Force on $m_2$\n# All we need to change to examine the case in Figure 2, which has the force input on the second mass, is the $F$ matrix we defined above. 
Then, re-plotting the frequency responses will show at which (and how many) frequencies each mass has a zero-amplitude response.\n#\n#

\n# Figure 2: A Four-Mass-Spring System with Excitation Force on the Second Mass\n#

\n\n\nF1 = 0.0\nF2 = 1.0\nF3 = 0.0\nF4 = 0.0\n\nF = [F1, F2, F3, F4]\n\nw = np.linspace(0, 6, 1200)\nX = np.zeros((len(w), 4))\n\n# This is (K-w^2 M)^-1 * F\nfor ii, freq in enumerate(w):\n X[ii, :] = np.dot(linalg.inv(K - freq**2 * M), F)\n\n# Let's mask the discontinuity, so it isn't plotted\npos = np.where(np.abs(X[:, 0]) >= 15)\nX[pos, :] = np.nan\nw[pos] = np.nan\n\n\n# Set the plot size - 3x2 aspect ratio is best\nfig = plt.figure(figsize=(12, 8))\n\nplt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)\n\n# Change the axis units to CMU Serif\nplt.setp(ax.get_ymajorticklabels(), family='serif', fontsize=18)\nplt.setp(ax.get_xmajorticklabels(), family='serif', fontsize=18)\n\n\nplt.subplot(2, 2, 1)\nplt.plot(w, X[:, 0], linewidth=2, label=r'$\\bar{x}_1$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_1$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-2, 2)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\n\nplt.subplot(2, 2, 2)\nplt.plot(w, X[:, 1], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_2$')\n# Define the X and Y axis labels\nplt.xlabel('Frequency (rad/s)', family='serif',\n fontsize=22, weight='bold', labelpad=5)\nplt.ylabel(r'$\\bar{x}_2$', family='serif',\n fontsize=22, weight='bold', labelpad=10)\nplt.ylim(-2, 2)\nax = plt.gca()\nax.spines['right'].set_color('none')\nax.spines['top'].set_color('none')\nax.xaxis.set_ticks_position('bottom')\nax.yaxis.set_ticks_position('left')\n\nplt.subplot(2, 2, 3)\nplt.plot(w, X[:, 2], linewidth=2, linestyle=\"-\", label=r'$\\bar{x}_3$')\n", "project_metadata": {"full_name": "DocVaughan/MCHE485---Mechanical-Vibrations", "description": "Code supporting MCHE485: Mechanical Vibrations at the Univsersity of Louisiana at Lafayette", "topics": ["mechanical-vibrations", "fft", "jupyter-notebook", "python", "education"], "git_url": "git://github.com/DocVaughan/MCHE485---Mechanical-Vibrations.git", "stars": 21, "watchers": 21, "forks": 18, "created": "2015-01-06T00:10:33Z", "size": 43628, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4005438, "TeX": 10648, "CSS": 2772}, "last_updated": "2020-12-23T16:48:18Z"}, "intent": "# Define the X and Y axis labels"}, {"original_comment": "# view classes\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Table of Contents

\n# \n\n# # Overview\n\n# ## Notebook Summary\n\n# This notebook walks through the steps to build and train the multi-class image classification algorithm used on\n# __[wittytourist.com](https://wittytourist.com)__\n#\n# It is a convolutional neural net (CNN) based on the resnet50 architecture (has 50 layers) and you can learn the details in the __[resnet paper](https://arxiv.org/pdf/1512.03385.pdf)__. The model starts with pre-trained weights from ImageNet and with transfer learning, it is fine-tuned to fit the image classes in this notebook.\n\n# ## CNN - Convolutional Neural Nets Overview\n\n# To understand the underlying math and intuition behind CNNs, check out this step-by-step __[blog post](https://towardsdatascience.com/cutting-edge-face-recognition-is-complicated-these-spreadsheets-make-it-easier-e7864dbf0e1a)__ I built using Excel.\n\n# ![Image of CNN in Excel](https://cdn-images-1.medium.com/max/2000/1*m65nIVO62a4Dua2QzmPI2A.png)\n\n# ![Image of CNN in Excel](https://cdn-images-1.medium.com/max/2000/1*JrxHmdQH4HFNj4aBtuuEpQ.png)\n\n# ## Import Libraries\n\n# Import library dependencies and note that the fastai library already has most dependencies needed (numpy, pandas, etc.)\n\n#%%\n\nfrom fastai import *\nfrom fastai.vision import *\nimport zipfile\nfrom fastai.widgets import *\n\n#%%\n\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Data Collection & Data Exploration\n\n# ## Data Collection\n\n# There are 2 steps to make your own training set of images:\n#\n# 1. Download images\n# 2. Prune images (i.e. remove mislabeled images, images not representative of your use case, etc.)\n\n# ### Download Images for Your Dataset\n\n# There are several ways you can download the images to build your dataset. Here are a few resources you can check out:\n#\n# 1. __Google__\n# 1. [Google images - fast.ai script](https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson2-download.ipynb)\n# 2. [Google images - another download script](https://github.com/hardikvasa/google-images-download)\n#\n#\n# 2. __Instagram__\n# 1. [Instagram API](https://www.instagram.com/developer/)\n# 2. [Instalooter](https://github.com/althonos/instaLooter)\n#\n# To ensure my images were representative of the typical use case (taking & sharing a photo on social media), I trained the model with images downloaded from Instagram.\n\n# ### Prune Images\n\n# Getting a large and clean dataset of images is the most time-consuming (but also most important) part of building an accurate image classification model. After downloading over 5,000 images, I used my photo management software (Adobe Lightroom) to go through each image to accept or reject it.\n#\n# After hours of pruning images, I was left with:\n# - ~5,000 training/validation images, ~1.1 GB, and ~1,000 test images\n# - 13 classes of images\n# - ~300 images per image class\n#\n# I created 4 .zip files to be imported and used for training & validating the model. You could create 1 .zip file with all images, but I had issues with the upload being interrupted so I opted to create 4 upload files.\n\n# ## Import images\n\n# In the Jupyter notebook, start by uploading zipped file(s) which contains sub-folders for each class of images. 
I stored files in the path:\n#\n# - /home/jupyter/projects/05-witty-tourist/images/train/\n#\n# Each sub-folder is a separate class that contains images for that class:\n#\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-1\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-2\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-3\n# - ...\n#\n# After uploading the zipped file(s), unzip the file(s)\n\n#%%\n\n# only run this cell once\nsf_images_zip = zipfile.ZipFile(\n '/home/jupyter/projects/05-witty-tourist/images/train/sf-images.zip')\nsf_images.extractall('/home/jupyter/projects/05-witty-tourist/images/train')\nsf_images.close()\n\n\n# Set the path to your project.\n\n#%%\n\npath = Path('/home/jupyter/projects/05-witty-tourist/')\n\n\n# Set the classes of the images you want to classify and validate each file is an acceptable image format.\n#\n# I use 13 of the San Franciso Bay area's most iconic landmarks/sights to see.\n\n#%%\n\n# create classes you want to train your network to detect\nclasses = ['the Golden Gate Bridge',\n 'the Oakland Bay Bridge',\n 'a cable car',\n 'Lombard Street',\n 'Alcatraz',\n 'the Painted Ladies at Alamo Square',\n 'the Palace of Fine Arts',\n 'the sea lions at Pier 39',\n 'the Transamerica Pyramid',\n 'Muir Woods',\n 'Coit Tower',\n 'Fisherman\\'s Wharf',\n 'Ghirardelli Square'\n ]\n\n#%%\n\n# set path to training images location\ntrain_path = Path('/home/jupyter/projects/05-witty-tourist/images/train')\n\n# verify each image is an acceptable image file\nfor c in classes:\n print(c)\n verify_images(train_path/c, delete=True, max_workers=8)\n\n\n# ## Data Exploration & Visualization\n\n# Now, let's take a look at some of our pictures.\n\n# ### Store data in databunch & view images\n\n#%%\n\nnp.random.seed(42)\ndata = ImageDataBunch.from_folder(train_path, train=\".\", valid_pct=0.20,\n ds_tfms=get_transforms(), size=128,\n num_workers=4).normalize(imagenet_stats)\n\n\n# Create an empty folder called 'models' and view files stored there\n\n#%%\n\n# only run this cell once\nget_ipython().run_line_magic(\n 'mkdir', '\"/home/jupyter/projects/05-witty-tourist/models\"')\n\n#%%\n\n# set path to your models\nmodel_path = Path('/home/jupyter/projects/05-witty-tourist/models')\n\n# view current files stored in models folder\nmodel_path.ls()\n\n#%%", "target_code": "data.classes\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Table of Contents

\n# \n\n# # Overview\n\n# ## Notebook Summary\n\n# This notebook walks through the steps to build and train the multi-class image classification algorithm used on\n# __[wittytourist.com](https://wittytourist.com)__\n#\n# It is a convolutional neural net (CNN) based on the resnet50 architecture (has 50 layers) and you can learn the details in the __[resnet paper](https://arxiv.org/pdf/1512.03385.pdf)__. The model starts with pre-trained weights from ImageNet and with transfer learning, it is fine-tuned to fit the image classes in this notebook.\n\n# ## CNN - Convolutional Neural Nets Overview\n\n# To understand the underlying math and intuition behind CNNs, check out this step-by-step __[blog post](https://towardsdatascience.com/cutting-edge-face-recognition-is-complicated-these-spreadsheets-make-it-easier-e7864dbf0e1a)__ I built using Excel.\n\n# ![Image of CNN in Excel](https://cdn-images-1.medium.com/max/2000/1*m65nIVO62a4Dua2QzmPI2A.png)\n\n# ![Image of CNN in Excel](https://cdn-images-1.medium.com/max/2000/1*JrxHmdQH4HFNj4aBtuuEpQ.png)\n\n# ## Import Libraries\n\n# Import library dependencies and note that the fastai library already has most dependencies needed (numpy, pandas, etc.)\n\n\nfrom fastai import *\nfrom fastai.vision import *\nimport zipfile\nfrom fastai.widgets import *\n\n\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Data Collection & Data Exploration\n\n# ## Data Collection\n\n# There are 2 steps to make your own training set of images:\n#\n# 1. Download images\n# 2. Prune images (i.e. remove mislabeled images, images not representative of your use case, etc.)\n\n# ### Download Images for Your Dataset\n\n# There are several ways you can download the images to build your dataset. Here are a few resources you can check out:\n#\n# 1. __Google__\n# 1. [Google images - fast.ai script](https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson2-download.ipynb)\n# 2. [Google images - another download script](https://github.com/hardikvasa/google-images-download)\n#\n#\n# 2. __Instagram__\n# 1. [Instagram API](https://www.instagram.com/developer/)\n# 2. [Instalooter](https://github.com/althonos/instaLooter)\n#\n# To ensure my images were representative of the typical use case (taking & sharing a photo on social media), I trained the model with images downloaded from Instagram.\n\n# ### Prune Images\n\n# Getting a large and clean dataset of images is the most time-consuming (but also most important) part of building an accurate image classification model. After downloading over 5,000 images, I used my photo management software (Adobe Lightroom) to go through each image to accept or reject it.\n#\n# After hours of pruning images, I was left with:\n# - ~5,000 training/validation images, ~1.1 GB, and ~1,000 test images\n# - 13 classes of images\n# - ~300 images per image class\n#\n# I created 4 .zip files to be imported and used for training & validating the model. You could create 1 .zip file with all images, but I had issues with the upload being interrupted so I opted to create 4 upload files.\n\n# ## Import images\n\n# In the Jupyter notebook, start by uploading zipped file(s) which contains sub-folders for each class of images. 
I stored files in the path:\n#\n# - /home/jupyter/projects/05-witty-tourist/images/train/\n#\n# Each sub-folder is a separate class that contains images for that class:\n#\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-1\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-2\n# - /home/jupyter/projects/05-witty-tourist/images/train/class-3\n# - ...\n#\n# After uploading the zipped file(s), unzip the file(s)\n\n\n# only run this cell once\nsf_images_zip = zipfile.ZipFile(\n '/home/jupyter/projects/05-witty-tourist/images/train/sf-images.zip')\nsf_images.extractall('/home/jupyter/projects/05-witty-tourist/images/train')\nsf_images.close()\n\n\n# Set the path to your project.\n\n\npath = Path('/home/jupyter/projects/05-witty-tourist/')\n\n\n# Set the classes of the images you want to classify and validate each file is an acceptable image format.\n#\n# I use 13 of the San Franciso Bay area's most iconic landmarks/sights to see.\n\n\n# create classes you want to train your network to detect\nclasses = ['the Golden Gate Bridge',\n 'the Oakland Bay Bridge',\n 'a cable car',\n 'Lombard Street',\n 'Alcatraz',\n 'the Painted Ladies at Alamo Square',\n 'the Palace of Fine Arts',\n 'the sea lions at Pier 39',\n 'the Transamerica Pyramid',\n 'Muir Woods',\n 'Coit Tower',\n 'Fisherman\\'s Wharf',\n 'Ghirardelli Square'\n ]\n\n\n# set path to training images location\ntrain_path = Path('/home/jupyter/projects/05-witty-tourist/images/train')\n\n# verify each image is an acceptable image file\nfor c in classes:\n print(c)\n verify_images(train_path/c, delete=True, max_workers=8)\n\n\n# ## Data Exploration & Visualization\n\n# Now, let's take a look at some of our pictures.\n\n# ### Store data in databunch & view images\n\n\nnp.random.seed(42)\ndata = ImageDataBunch.from_folder(train_path, train=\".\", valid_pct=0.20,\n ds_tfms=get_transforms(), size=128,\n num_workers=4).normalize(imagenet_stats)\n\n\n# Create an empty folder called 'models' and view files stored there\n\n\n# only run this cell once\nget_ipython().run_line_magic(\n 'mkdir', '\"/home/jupyter/projects/05-witty-tourist/models\"')\n\n\n# set path to your models\nmodel_path = Path('/home/jupyter/projects/05-witty-tourist/models')\n\n# view current files stored in models folder\nmodel_path.ls()\n\n", "project_metadata": {"full_name": "DaveSmith227/witty-tourist", "description": "Computer vision web application that detects famous landmarks in images and returns a witty caption.", "topics": [], "git_url": "git://github.com/DaveSmith227/witty-tourist.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2019-03-03T00:08:08Z", "size": 5395, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5505645, "Python": 6152, "HTML": 3709, "CSS": 2785, "Dockerfile": 250}, "last_updated": "2020-06-07T07:44:07Z"}, "intent": "# view classes"}, {"original_comment": "# Plot the results from the above.\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DCR AGN Metric Analysis\n#\n# G. Richards, B. Martin, W. Yu, C. Peters (August 2020)\n\n# This notebook takes simulated quasar colors and DCR slopes. It takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. 
The slope determined from all the observations is compared to the \"known\" slope.\n#\n# This needs a fair bit of cleaning up and documenting. It is also unnecessarily slow. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n#%%\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ### open files with redshift bins, colors, and DCR slopes\n#\n# For historical reasons the redshift bins and the colors and DCR slopes are stored in separate data files. These next few cells just merges them together into a single Pandas DataFrame.\n\n#%%\n\n# Table of redshift bins where we have computed the mean colors and DCR slopes.\nzfile = 'fittingS82_zshifts.dat'\ndfZ = pd.read_csv(zfile)\n# dfZ.reset_index(drop=True)\ndfZ.head()\n\n#%%\n\n# Table of colors and DCR slopes for the above redshifts\ndatafile = 'fittingS82_zshiftfit.dat'\ndfData = pd.read_csv(datafile, index_col=0, header=None, sep=' ').T\ndfQSO = dfData.reset_index(drop=True).dropna()\n# dfQSO.dropna()\n\n#%%\n\ndfDCR = pd.concat([dfZ, dfQSO], axis=1)\ndfDCR.head()\n\n#%%\n\n# Check to make sure that all the entries are aligned (without the reset_index, there was an offset)\ndfDCR.tail()\n\n\n# ---\n\n# The next cell sets the astrometric error in the u and g bands. This needs to be changed for different simulated magnitudes. Someone also needs to determine how the astrometric errors map to u- and g-band magnitudes for LSST\n#\n# For SDSS the astrometric error at r~22 was 0.100 arcsec (Pier et al. 2003).\n#\n# N.B. This error array gets overridden in the functions below!!\n\n#%%\n\nastrometric_error = [0.035, 0.025] # Units: arcsec\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n\n\n# This next cell determines the airmasses and filters from an opSim. It needs to be changed and incorporated into the MAF framework so that many opSims can be tested. It should also assign each of the redshifts above to a random position on the sky. 
Ideally more like each redshift gets ~100 random positions.\n#\n# For now, just using the one random position once for each redshift.\n\n#%%\n\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n\n# Extract the airmass and filters for each observation\n\n#%%\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#%%\n\nprint(len(airmasses), len(filters))\nprint(airmasses[:5])\nprint(filters[:5])\n\n\n# ## generate observed slopes from true slopes and observations\n\n# *lnlike* calculates the loglikelihood, *lnprior* creates a prior on our linear fits, *lnprob* adds the prior to lnlike\n#\n# *run_fit* runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the difference between 16th and 84th percentiles as the error.\n#\n# This mcmc approach for the linear regression problem is unnecessarily complicated/slow for the purpose for which it is being used.\n#\n# N.B. run_fit is computing the slope in the offset vs. tanZ plane for a **single** object\n\n#%%\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# *dcrSlopeCalc* is computing the slope in the offset vs. 
tanZ plane for **all** the objects, calling *run_fit* for each\n\n#%%\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n # Note that the next line overrides the cell above!!\n astrometric_error = [0.035, 0.025]\n\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n 
plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# The next cell actually calls the code that computes the slopes.\n#\n# This is taking every object in the test set and treating them as if they were observed at the same position on the sky from the simulation. We need to change that.\n\n#%%\n\nobs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr = dcrSlopeCalc(\n airmasses, filters, dfQSO, makePlot=True)\n\n\n# The next cell makes a plot of the predicted DCR slope for all of the test objects and overplots that information on a plot of the true DCR slopes.\n\n#%%\n\nsort_indices = np.argsort(dfDCR['zshifts'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. Redshift')\nplt.scatter(dfDCR['zshifts'][sort_indices], dfDCR['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(dfDCR['zshifts'][sort_indices], obs_slopes_u[sort_indices],\n color='black', label='Observed u slope', alpha=0.7)\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(dfDCR['zshifts'][sort_indices], dfDCR['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(dfDCR['zshifts'][sort_indices], obs_slopes_g[sort_indices],\n color='black', label='Observed g slope', alpha=0.7)\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\nplt.savefig('dcr1510.png')\n\n\n# ---\n\n# The above shows the slopes calculated for each object in each reshift bin.\n#\n# *slopeProgressionCalcDF* computes how the slope predictions change wtih each new observation. We are going to compute this for each object, so this will take quite some time. For right now, each object is treated as being at the same point on the sky. Note that the way this is coded, it won't work for objects with different number of observations (i.e., different positions on the sky).\n\n#%%\n\n# N.B. makePlot=True generates a LOT of plots. One for each observation. 
Use with care!\n# This is NOT fast.\ndef slopeProgressionCalcDF(airmasses, filters, test_quasars):\n dfSlopes_u = pd.DataFrame()\n dfSlopes_g = pd.DataFrame()\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n diff_array_u = []\n diff_array_g = []\n num_obs_array_u = []\n num_obs_array_g = []\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n # print(i)\n slopes_array_u = []\n slopes_array_g = []\n redshift = test_quasars['zshifts'][i]\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n print(i, redshift, true_slope_u, true_slope_g)\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n for j, airmass in enumerate(airmasses):\n # print(j,airmasses[j],filters[j])\n # print(j,airmasses,filters)\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n #print(\"tan Z\",tanZ_obs)\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n #print(\"R_obs u\",R_obs)\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n #print(\"R_obs g\",R_obs)\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n NumObsPerBand = 2\n # print(len(tanZList_u),len(tanZList_g))\n while ((NumObsPerBand <= len(tanZList_u)) or (NumObsPerBand <= len(tanZList_g))):\n if NumObsPerBand < len(tanZList_g):\n tanZList_g_copy = tanZList_g[:NumObsPerBand]\n RList_g_copy = RList_g[:NumObsPerBand]\n RerrList_g_copy = RerrList_g[:NumObsPerBand]\n else:\n tanZList_g_copy = tanZList_g\n RList_g_copy = RList_g\n RerrList_g_copy = RerrList_g\n if NumObsPerBand < len(tanZList_u):\n tanZList_u_copy = tanZList_u[:NumObsPerBand]\n RList_u_copy = RList_u[:NumObsPerBand]\n RerrList_u_copy = RerrList_u[:NumObsPerBand]\n else:\n tanZList_u_copy = tanZList_u\n RList_u_copy = RList_u\n RerrList_u_copy = RerrList_u\n\n # print(i,j,tanZList_u_copy,RList_u_copy)\n m_mcmc_u, merr_mcmc_u = run_fit(\n tanZList_u_copy, RList_u_copy, RerrList_u_copy)\n m_mcmc_g, merr_mcmc_g = run_fit(\n tanZList_g_copy, RList_g_copy, RerrList_g_copy)\n # End while loop\n\n slopes_array_u = np.append(\n slopes_array_u, abs(m_mcmc_u - true_slope_u))\n slopes_array_g = np.append(\n slopes_array_g, abs(m_mcmc_g - true_slope_g))\n NumObsPerBand += 1\n # print(i,slopes_array_u)\n # End airmass loop\n dfSlopes_u[i] = slopes_array_u\n dfSlopes_g[i] = slopes_array_g\n # End quasar loop\n return dfSlopes_u, dfSlopes_g\n\n\n# The next cell calls the function above. Right now just using 5 objects since it takes a LONG time to run. 
This needs to be fixed.\n\n#%%\n\ndfSlopes_u, dfSlopes_g = slopeProgressionCalcDF(airmasses, filters, dfDCR[:5])", "target_code": "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))\n\ndfSlopes_u.plot(ax=ax1, legend=False)\nax1.set_xlabel('u-band Epochs')\nax1.set_ylabel('Error in DCR Slope')\n\ndfSlopes_g.plot(ax=ax2, legend=False)\nax2.set_xlabel('g-band Epochs')\nax2.set_ylabel('Error in DCR Slope')\nplt.savefig('DCRprogression.png')\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DCR AGN Metric Analysis\n#\n# G. Richards, B. Martin, W. Yu, C. Peters (August 2020)\n\n# This notebook takes simulated quasar colors and DCR slopes. It takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from all the observations is compared to the \"known\" slope.\n#\n# This needs a fair bit of cleaning up and documenting. It is also unnecessarily slow. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ### open files with redshift bins, colors, and DCR slopes\n#\n# For historical reasons the redshift bins and the colors and DCR slopes are stored in separate data files. These next few cells just merges them together into a single Pandas DataFrame.\n\n\n# Table of redshift bins where we have computed the mean colors and DCR slopes.\nzfile = 'fittingS82_zshifts.dat'\ndfZ = pd.read_csv(zfile)\n# dfZ.reset_index(drop=True)\ndfZ.head()\n\n\n# Table of colors and DCR slopes for the above redshifts\ndatafile = 'fittingS82_zshiftfit.dat'\ndfData = pd.read_csv(datafile, index_col=0, header=None, sep=' ').T\ndfQSO = dfData.reset_index(drop=True).dropna()\n# dfQSO.dropna()\n\n\ndfDCR = pd.concat([dfZ, dfQSO], axis=1)\ndfDCR.head()\n\n\n# Check to make sure that all the entries are aligned (without the reset_index, there was an offset)\ndfDCR.tail()\n\n\n# ---\n\n# The next cell sets the astrometric error in the u and g bands. This needs to be changed for different simulated magnitudes. Someone also needs to determine how the astrometric errors map to u- and g-band magnitudes for LSST\n#\n# For SDSS the astrometric error at r~22 was 0.100 arcsec (Pier et al. 2003).\n#\n# N.B. 
This error array gets overridden in the functions below!!\n\n\nastrometric_error = [0.035, 0.025] # Units: arcsec\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n\n\n# This next cell determines the airmasses and filters from an opSim. It needs to be changed and incorporated into the MAF framework so that many opSims can be tested. It should also assign each of the redshifts above to a random position on the sky. Ideally more like each redshift gets ~100 random positions.\n#\n# For now, just using the one random position once for each redshift.\n\n\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n\n# Extract the airmass and filters for each observation\n\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n\nprint(len(airmasses), len(filters))\nprint(airmasses[:5])\nprint(filters[:5])\n\n\n# ## generate observed slopes from true slopes and observations\n\n# *lnlike* calculates the loglikelihood, *lnprior* creates a prior on our linear fits, *lnprob* adds the prior to lnlike\n#\n# *run_fit* runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the difference between 16th and 84th percentiles as the error.\n#\n# This mcmc approach for the linear regression problem is unnecessarily complicated/slow for the purpose for which it is being used.\n#\n# N.B. run_fit is computing the slope in the offset vs. tanZ plane for a **single** object\n\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# *dcrSlopeCalc* is computing the slope in the offset vs. 
tanZ plane for **all** the objects, calling *run_fit* for each\n\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n # Note that the next line overrides the cell above!!\n astrometric_error = [0.035, 0.025]\n\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta 
Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# The next cell actually calls the code that computes the slopes.\n#\n# This is taking every object in the test set and treating them as if they were observed at the same position on the sky from the simulation. We need to change that.\n\n\nobs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr = dcrSlopeCalc(\n airmasses, filters, dfQSO, makePlot=True)\n\n\n# The next cell makes a plot of the predicted DCR slope for all of the test objects and overplots that information on a plot of the true DCR slopes.\n\n\nsort_indices = np.argsort(dfDCR['zshifts'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. Redshift')\nplt.scatter(dfDCR['zshifts'][sort_indices], dfDCR['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(dfDCR['zshifts'][sort_indices], obs_slopes_u[sort_indices],\n color='black', label='Observed u slope', alpha=0.7)\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(dfDCR['zshifts'][sort_indices], dfDCR['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(dfDCR['zshifts'][sort_indices], obs_slopes_g[sort_indices],\n color='black', label='Observed g slope', alpha=0.7)\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\nplt.savefig('dcr1510.png')\n\n\n# ---\n\n# The above shows the slopes calculated for each object in each reshift bin.\n#\n# *slopeProgressionCalcDF* computes how the slope predictions change wtih each new observation. We are going to compute this for each object, so this will take quite some time. For right now, each object is treated as being at the same point on the sky. Note that the way this is coded, it won't work for objects with different number of observations (i.e., different positions on the sky).\n\n\n# N.B. makePlot=True generates a LOT of plots. One for each observation. 
Use with care!\n# This is NOT fast.\ndef slopeProgressionCalcDF(airmasses, filters, test_quasars):\n dfSlopes_u = pd.DataFrame()\n dfSlopes_g = pd.DataFrame()\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n diff_array_u = []\n diff_array_g = []\n num_obs_array_u = []\n num_obs_array_g = []\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n # print(i)\n slopes_array_u = []\n slopes_array_g = []\n redshift = test_quasars['zshifts'][i]\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n print(i, redshift, true_slope_u, true_slope_g)\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n for j, airmass in enumerate(airmasses):\n # print(j,airmasses[j],filters[j])\n # print(j,airmasses,filters)\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n #print(\"tan Z\",tanZ_obs)\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n #print(\"R_obs u\",R_obs)\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n #print(\"R_obs g\",R_obs)\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n NumObsPerBand = 2\n # print(len(tanZList_u),len(tanZList_g))\n while ((NumObsPerBand <= len(tanZList_u)) or (NumObsPerBand <= len(tanZList_g))):\n if NumObsPerBand < len(tanZList_g):\n tanZList_g_copy = tanZList_g[:NumObsPerBand]\n RList_g_copy = RList_g[:NumObsPerBand]\n RerrList_g_copy = RerrList_g[:NumObsPerBand]\n else:\n tanZList_g_copy = tanZList_g\n RList_g_copy = RList_g\n RerrList_g_copy = RerrList_g\n if NumObsPerBand < len(tanZList_u):\n tanZList_u_copy = tanZList_u[:NumObsPerBand]\n RList_u_copy = RList_u[:NumObsPerBand]\n RerrList_u_copy = RerrList_u[:NumObsPerBand]\n else:\n tanZList_u_copy = tanZList_u\n RList_u_copy = RList_u\n RerrList_u_copy = RerrList_u\n\n # print(i,j,tanZList_u_copy,RList_u_copy)\n m_mcmc_u, merr_mcmc_u = run_fit(\n tanZList_u_copy, RList_u_copy, RerrList_u_copy)\n m_mcmc_g, merr_mcmc_g = run_fit(\n tanZList_g_copy, RList_g_copy, RerrList_g_copy)\n # End while loop\n\n slopes_array_u = np.append(\n slopes_array_u, abs(m_mcmc_u - true_slope_u))\n slopes_array_g = np.append(\n slopes_array_g, abs(m_mcmc_g - true_slope_g))\n NumObsPerBand += 1\n # print(i,slopes_array_u)\n # End airmass loop\n dfSlopes_u[i] = slopes_array_u\n dfSlopes_g[i] = slopes_array_g\n # End quasar loop\n return dfSlopes_u, dfSlopes_g\n\n\n# The next cell calls the function above. Right now just using 5 objects since it takes a LONG time to run. 
This needs to be fixed.\n\n\ndfSlopes_u, dfSlopes_g = slopeProgressionCalcDF(airmasses, filters, dfDCR[:5])\n\n\n\n", "project_metadata": {"full_name": "RichardsGroup/LSSTprep", "description": "Repository for Richards group LSST prep work, specifically related to the AGN SC", "topics": [], "git_url": "git://github.com/RichardsGroup/LSSTprep.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2018-06-20T20:43:08Z", "size": 30265, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 8424521, "Python": 6419}, "last_updated": "2020-09-28T18:32:02Z"}, "intent": "# Plot the results from the above."}, {"original_comment": "# Determise the size of each image\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # [Whale Classification Model](https://www.kaggle.com/martinpiotte/whale-recognition-model-with-score-0-78563/notebook)\n# This notebook describes the strategy behind the 0.78563 submission to the Humpack Whale identification Challenge.\n#\n# It should be studied in conjunction with the [Bounding Box Model](http://www.kaggle.com/martinpiotte/bounding-box-model) notebook which describes separately the strategy for image cropping.\n#\n# To speed things up, the results of some slow computations are included as a dataset instead of being recomputed here. However, the code is still provided in the notebook as reference, even if it is not executed by default.\n#\n# >**[Bounding Box Model](http://www.kaggle.com/martinpiotte/bounding-box-model) \u7528\u4e8e\u88c1\u526a\u65cb\u8f6c\u540e\u7684\u56fe\u7247**\n\n# # Abstract\n# The approach used for this submission is essentially a [Siamese Neural Network](http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf), with a few modifications that will be covered in details later. The element that generated the largest accuracy improvement is the procedure used to generate image pairs during training. Each training epoch is composed of a sequence of image pairs (A, B) such that:\n#\n# * Exactly 50% of the pairs are for matching whales, and 50% for different whales;\n# * Each image from the training set is used exactly 4 times per epoch: A and B images of matching whales, A and B images of different whale pairs;\n# * Pairs of images of different whales are selected to be difficult for the network to distinguish at a given stage of the training. This is inspired from adversarial training: find pairs of images that are from different whales, but that are still very similar from the model perspective.\n#\n# Implementing this strategy while training a Siamese Neural Network is what makes the largest contribution to the model accuracy. Other details contribute somewhat to the accuracy, but have a much smaller impact.\n#\n# >**\u6a21\u578b\u57fa\u4e8e[Siamese Neural Network](http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf)\uff1b
\n# The model's biggest gain comes from the procedure used to generate training pairs:
\n# exactly 50% of the pairs are images of the same whale, and the other 50% are images of different whales;
\n# each image is picked exactly 4 times per epoch: A and B images of matching whales, A and B images of different whale pairs;
\n# the training pairs used for training should be as hard as possible: pairs of images that belong to different whales but whose current model output vectors are similar\n# **\n\n# # Overview\n# This notebook describes all the different elements of the submission. Obviously, to cover everything, it has to be fairly long. I encourage everyone to skip ahead directly to whatever you are most interested in, without necessarily going through everything.\n# ## Content\n# 1. Duplicate image identification (not much to see here -- keep moving)\n# 1. Image preprocessing (just the regular stuff)\n# 1. Siamese Neural Network architecture (some interesting thoughts)\n# 1. Training data construction (most of the secret sauce is here)\n# 1. Training procedure (zzzzz.....)\n# 1. Generating the submission file (re-zzzzz.....)\n# 1. Bootstrapping and ensemble (classic but short)\n# 1. Visualization (everyone's favorite!)\n# 1. Off topic (why add this unless it is interesting?)\n#\n# >**1. Duplicate image identification (phash, perceptual hashing)
\n# 2. Image preprocessing
\n# 3. Siamese network architecture
\n# 4. Training data construction (the most useful step)
\n# 5. Training procedure
\n# 6. Generating the submission file
\n# 7. Bootstrapping and model ensembling (classic and concise)
\n# 8. Visualization
\n# 9. Some other remarks ...
**\n#\n\n# # Duplicate image identification\n# This section describes the heuristic used to identify duplicate images. The fact that the training and test sets have duplicate images has already been well documented. Some images are perfect binary copies, while others have been altered somewhat: contrast and brightness, size, masking the legend, etc.\n# >** Duplicate images exist in both the training and test sets**\n#\n# Two images are considered duplicates if they meet the following criteria:\n#\n# 1. Both images have the same [Perceptual Hash](http://www.phash.org/) (phash); or\n# 1. Both images have:\n# 1. phash that differ by at most 6 bits, and;\n# 1. have the same size, and;\n# 1. the pixelwise mean square error between the normalized images is below a given threshold.\n#\n# >**Two images that meet the following conditions are considered duplicates:
\n# 1. Both images have the same [Perceptual Hash](http://www.phash.org/) (phash)
\n# 2. Both images satisfy:
\n# (1) the phash values differ by at most 6 bits, and
\n# (2) the images have the same size, and
\n# (3) the pixel-wise mean squared error between the normalized images is below a given threshold
**\n#\n# The *p2h* dictionary associate a unique image id (phash) for each picture. The *h2p* dictionary associate each unique image id to the prefered image to be used for this hash.\n#\n# The prefered image is the one with the highest resolution, or any one if they have the same resolution.\n\n#%%\n\n# Read the dataset description\nfrom keras.preprocessing.image import img_to_array, array_to_img\nfrom math import sqrt\nfrom scipy.ndimage import gaussian_filter\nfrom matplotlib.ticker import MaxNLocator\nfrom matplotlib.colors import BoundaryNorm\nfrom matplotlib import cm\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom keras_tqdm import TQDMNotebookCallback\nfrom keras.utils import Sequence\nfrom keras.utils import plot_model\nfrom keras.models import Model\nfrom keras.layers import Activation, Add, BatchNormalization, Concatenate, Conv2D, Dense, Flatten, GlobalMaxPooling2D, Lambda, MaxPooling2D, Reshape\nfrom keras.engine.topology import Input\nfrom keras.optimizers import Adam\nfrom keras import regularizers\nfrom scipy.ndimage import affine_transform\nfrom keras import backend as K\nimport random\nimport keras\nimport sys\nimport matplotlib.pyplot as plt\nfrom imagehash import phash\nimport numpy as np\nimport pickle\nfrom tqdm import tqdm_notebook\nfrom PIL import Image as pil_image\nfrom os.path import isfile\nfrom pandas import read_csv\n\ntagged = dict([(p, w) for _, p, w in read_csv(\n '../input/whale-categorization-playground/train.csv').to_records()])\nsubmit = [p for _, p, _ in read_csv(\n '../input/whale-categorization-playground/sample_submission.csv').to_records()]\njoin = list(tagged.keys()) + submit\nlen(tagged), len(submit), len(join), list(tagged.items())[:5], submit[:5]\n\n#%%", "target_code": "from tqdm import tqdm_notebook\nfrom PIL import Image as pil_image\n\np2size = {}\nfor p in tqdm_notebook(join):\n size = pil_image.open(expand_path(p)).size\n p2size[p] = size\nlen(p2size), list(p2size.items())[:5]\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # [Whale Classification Model](https://www.kaggle.com/martinpiotte/whale-recognition-model-with-score-0-78563/notebook)\n# This notebook describes the strategy behind the 0.78563 submission to the Humpack Whale identification Challenge.\n#\n# It should be studied in conjunction with the [Bounding Box Model](http://www.kaggle.com/martinpiotte/bounding-box-model) notebook which describes separately the strategy for image cropping.\n#\n# To speed things up, the results of some slow computations are included as a dataset instead of being recomputed here. However, the code is still provided in the notebook as reference, even if it is not executed by default.\n#\n# >**[Bounding Box Model](http://www.kaggle.com/martinpiotte/bounding-box-model) \u7528\u4e8e\u88c1\u526a\u65cb\u8f6c\u540e\u7684\u56fe\u7247**\n\n# # Abstract\n# The approach used for this submission is essentially a [Siamese Neural Network](http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf), with a few modifications that will be covered in details later. The element that generated the largest accuracy improvement is the procedure used to generate image pairs during training. 
Each training epoch is composed of a sequence of image pairs (A, B) such that:\n#\n# * Exactly 50% of the pairs are for matching whales, and 50% for different whales;\n# * Each image from the training set is used exactly 4 times per epoch: A and B images of matching whales, A and B images of different whale pairs;\n# * Pairs of images of different whales are selected to be difficult for the network to distinguish at a given stage of the training. This is inspired from adversarial training: find pairs of images that are from different whales, but that are still very similar from the model perspective.\n#\n# Implementing this strategy while training a Siamese Neural Network is what makes the largest contribution to the model accuracy. Other details contribute somewhat to the accuracy, but have a much smaller impact.\n#\n# >**\u6a21\u578b\u57fa\u4e8e[Siamese Neural Network](http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf)\uff1b
\n# The model's biggest gain comes from the procedure used to generate training pairs:
\n# exactly 50% of the pairs are images of the same whale, and the other 50% are images of different whales;
\n# each image is picked exactly 4 times per epoch: A and B images of matching whales, A and B images of different whale pairs;
\n# the training pairs used for training should be as hard as possible: pairs of images that belong to different whales but whose current model output vectors are similar\n# **\n\n# # Overview\n# This notebook describes all the different elements of the submission. Obviously, to cover everything, it has to be fairly long. I encourage everyone to skip ahead directly to whatever you are most interested in, without necessarily going through everything.\n# ## Content\n# 1. Duplicate image identification (not much to see here -- keep moving)\n# 1. Image preprocessing (just the regular stuff)\n# 1. Siamese Neural Network architecture (some interesting thoughts)\n# 1. Training data construction (most of the secret sauce is here)\n# 1. Training procedure (zzzzz.....)\n# 1. Generating the submission file (re-zzzzz.....)\n# 1. Bootstrapping and ensemble (classic but short)\n# 1. Visualization (everyone's favorite!)\n# 1. Off topic (why add this unless it is interesting?)\n#\n# >**1. Duplicate image identification (phash, perceptual hashing)
\n# 2. Image preprocessing
\n# 3. Siamese network architecture
\n# 4. Training data construction (the most useful step)
\n# 5. Training procedure
\n# 6. Generating the submission file
\n# 7. Bootstrapping and model ensembling (classic and concise)
\n# 8. Visualization
\n# 9. Some other remarks ...
**\n#\n\n# # Duplicate image identification\n# This section describes the heuristic used to identify duplicate images. The fact that the training and test sets have duplicate images has already been well documented. Some images are perfect binary copies, while others have been altered somewhat: contrast and brightness, size, masking the legend, etc.\n# >** Duplicate images exist in both the training and test sets**\n#\n# Two images are considered duplicates if they meet the following criteria:\n#\n# 1. Both images have the same [Perceptual Hash](http://www.phash.org/) (phash); or\n# 1. Both images have:\n# 1. phash that differ by at most 6 bits, and;\n# 1. have the same size, and;\n# 1. the pixelwise mean square error between the normalized images is below a given threshold.\n#\n# >**Two images that meet the following conditions are considered duplicates:
\n# 1. Both images have the same [Perceptual Hash](http://www.phash.org/) (phash)
\n# 2. Both images satisfy:
\n# (1) the phash values differ by at most 6 bits, and
\n# (2) the images have the same size, and
\n# (3) the pixel-wise mean squared error between the normalized images is below a given threshold
**\n#\n# The *p2h* dictionary associate a unique image id (phash) for each picture. The *h2p* dictionary associate each unique image id to the prefered image to be used for this hash.\n#\n# The prefered image is the one with the highest resolution, or any one if they have the same resolution.\n\n\n# Read the dataset description\nfrom keras.preprocessing.image import img_to_array, array_to_img\nfrom math import sqrt\nfrom scipy.ndimage import gaussian_filter\nfrom matplotlib.ticker import MaxNLocator\nfrom matplotlib.colors import BoundaryNorm\nfrom matplotlib import cm\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom keras_tqdm import TQDMNotebookCallback\nfrom keras.utils import Sequence\nfrom keras.utils import plot_model\nfrom keras.models import Model\nfrom keras.layers import Activation, Add, BatchNormalization, Concatenate, Conv2D, Dense, Flatten, GlobalMaxPooling2D, Lambda, MaxPooling2D, Reshape\nfrom keras.engine.topology import Input\nfrom keras.optimizers import Adam\nfrom keras import regularizers\nfrom scipy.ndimage import affine_transform\nfrom keras import backend as K\nimport random\nimport keras\nimport sys\nimport matplotlib.pyplot as plt\nfrom imagehash import phash\nimport numpy as np\nimport pickle\nfrom os.path import isfile\nfrom pandas import read_csv\n\ntagged = dict([(p, w) for _, p, w in read_csv(\n '../input/whale-categorization-playground/train.csv').to_records()])\nsubmit = [p for _, p, _ in read_csv(\n '../input/whale-categorization-playground/sample_submission.csv').to_records()]\njoin = list(tagged.keys()) + submit\nlen(tagged), len(submit), len(join), list(tagged.items())[:5], submit[:5]\n\n\n\ndef expand_path(p):\n if isfile('../input/whale-categorization-playground/train/' + p):\n return '../input/whale-categorization-playground/train/' + p\n if isfile('../input/whale-categorization-playground/test/' + p):\n return '../input/whale-categorization-playground/test/' + p\n return p\n\n\n", "project_metadata": {"full_name": "cy0616/kaggle-competition-Humpback-Whale-Identification-Challenge", "description": "kaggle competition Humpback Whale Identification Challenge", "topics": [], "git_url": "git://github.com/cy0616/kaggle-competition-Humpback-Whale-Identification-Challenge.git", "stars": 5, "watchers": 5, "forks": 4, "created": "2018-07-13T09:09:52Z", "size": 22424, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 349590, "Python": 39346}, "last_updated": "2019-03-02T03:17:43Z"}, "intent": "# Determise the size of each image"}, {"original_comment": "# Inspect the new WaveDrom structure\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PYNQ Project Proposal\n\n# ##### __Proposer:__ Patrick Lysaght (patrick**dot**lysaght**at**xilinx**dot**com)\n# __Version:__ 1.0\n# __Date:__ 5 Oct 2017\n\n# ## Title\n\n# Automatically annotate state labels to waveforms captured from executed FSMs in `logictools`\n\n# ### Abstract\n\n# Add support for displaying state labels to the waveforms generated by the FSM Generator in `logictools`\n\n# ### Motivation\n\n# The waveforms captured with the `logictools` Trace Analyzer can show the sequential operation a FSM. We can capture and display the behavior of an FSM, as it sequences through a given set of its possible states, in response to any given set of input conditions.\n#\n# Currently, however, the designer, must _manually_ decode each state from its corresponding state representation. 
This entails checking the signals corresponding to the state bits, and repeatedly matching the values of signals to the appropriate states. This can be a frustrating and error prone exercise.\n#\n# This process can be automated, making it much easier for the designer to interpret a waveform diagram. Each FSM waveform will consist of a particular sequence of states. These can be captured with the `logictools` Trace Analyzer, which can monitor an FSM as it executes.\n#\n# Automatically annotating state labels to a waveform captured from a running FSM, provides both a useful pedagogical tool, as well as, a helpful debug tool.\n#\n# Having the FSM states automatically decoded and annotated on to the waveform diagram, helps when debugging by making it easier to:\n# * visualize the operation of a FSM\n# * confirm that the intended FSM has been properly specified, or\n# * locate errors in the FSM operation caused either by improper specification, or unexpected operating conditions\n\n# ### Skill levels required for project\n\n# __Python:__ intermediate\n#\n# __Digital logic:__ intermediate\n#\n# __WaveDrom__: intermediate\n\n# ### Example\n\n# _`logictools`_ can capture the operation of a FSM. It can then display the operation as a waveform. This is achieved using the Trace Analyzer to monitor the operation of the FSM Generator, _under the hood_. The Wavedrom library is also invoked, _under the hood_, to display the waveform. The following script provides an example of a 3-bit, up/down Gray code counter:\n\n#%%\n\nfrom pynq.lib.logictools import Waveform\nfrom pprint import pprint\nfrom pynq.overlays.logictools import LogicToolsOverlay\n\n\nlogictools_olay = LogicToolsOverlay('logictools.bit')\n\n# Specify FSM to generate\ngray_cntr_spec = {'inputs': [('reset', 'D0'), ('direction', 'D1')],\n 'outputs': [('bit2', 'D3'), ('bit1', 'D4'), ('bit0', 'D5')],\n 'states': ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7'],\n 'transitions': [['01', 'S0', 'S1', '000'],\n ['00', 'S0', 'S7', '000'],\n ['01', 'S1', 'S2', '001'],\n ['00', 'S1', 'S0', '001'],\n ['01', 'S2', 'S3', '011'],\n ['00', 'S2', 'S1', '011'],\n ['01', 'S3', 'S4', '010'],\n ['00', 'S3', 'S2', '010'],\n ['01', 'S4', 'S5', '110'],\n ['00', 'S4', 'S3', '110'],\n ['01', 'S5', 'S6', '100'],\n ['00', 'S5', 'S4', '100'],\n ['01', 'S6', 'S7', '101'],\n ['00', 'S6', 'S5', '101'],\n ['01', 'S7', 'S0', '111'],\n ['00', 'S7', 'S6', '111'],\n ['1-', '*', 'S0', '']]}\n\n# Instantiate fsm_gen\nfsm_gen = logictools_olay.fsm_generator\n# Set trace to 16 samples initially\nfsm_gen.trace(num_analyzer_samples=16)\n# Configure fsm_gen with _spec\nfsm_gen.setup(gray_cntr_spec)\n# Run FSM, capture operation, and display waveform\nfsm_gen.run()\nfsm_gen.show_waveform()\n\n\n# By default, the trace waveform show the FSM inputs and the FSM outputs. To see the state vector bits, we need add an extra parameter in the `setup method` as follows:\n\n#%%\n\n# Reset fsm_gen before invoking setup again\nfsm_gen.reset()\n\n# Configure fsm_gen with _spec\nfsm_gen.setup(gray_cntr_spec, use_state_bits=True)\n# Run FSM, capture operation, and display waveform\nfsm_gen.run()\nfsm_gen.show_waveform()\n\n\n# We can access the data dictionary that was used to generate this waveform with the following script:\n\n#%%\n\nfsm_gen_wave_dict = fsm_gen.waveform.waveform_dict\npprint(fsm_gen_wave_dict)\n\n\n# This format of the data returned is one which is specified by the `WaveDrom` library. 
This is because, internally, `Waveform` objects are represented using `WaveDrom` format, and rendered using `WaveDrom` methods.\n#\n# The format is referred to as WaveDrom's WaveJSON format. For more details on how this works, refer to the [WaveDrom waveform tutorial](http://wavedrom.com/tutorial.html)\n#\n# Since this data is simply a WaveJSON dict, we can extend it with any additional annotations that we would find helpful. In this case, we want to add the label for each state the FSM sequences through.\n\n#%%\n\n# Let's annotate the first 16 states manually as an example\nexpected_state_seq = ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7',\n 'S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7']\nno_of_states = len(expected_state_seq)\n\nfsm_gen_wave_dict['signal'].extend([{},\n ['FSM states',\n {},\n {'name': 'state_label',\n 'wave': '2' * no_of_states,\n 'data': expected_state_seq},\n {}\n ]])", "target_code": "pprint(fsm_gen_wave_dict)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PYNQ Project Proposal\n\n# ##### __Proposer:__ Patrick Lysaght (patrick**dot**lysaght**at**xilinx**dot**com)\n# __Version:__ 1.0\n# __Date:__ 5 Oct 2017\n\n# ## Title\n\n# Automatically annotate state labels to waveforms captured from executed FSMs in `logictools`\n\n# ### Abstract\n\n# Add support for displaying state labels to the waveforms generated by the FSM Generator in `logictools`\n\n# ### Motivation\n\n# The waveforms captured with the `logictools` Trace Analyzer can show the sequential operation a FSM. We can capture and display the behavior of an FSM, as it sequences through a given set of its possible states, in response to any given set of input conditions.\n#\n# Currently, however, the designer, must _manually_ decode each state from its corresponding state representation. This entails checking the signals corresponding to the state bits, and repeatedly matching the values of signals to the appropriate states. This can be a frustrating and error prone exercise.\n#\n# This process can be automated, making it much easier for the designer to interpret a waveform diagram. Each FSM waveform will consist of a particular sequence of states. These can be captured with the `logictools` Trace Analyzer, which can monitor an FSM as it executes.\n#\n# Automatically annotating state labels to a waveform captured from a running FSM, provides both a useful pedagogical tool, as well as, a helpful debug tool.\n#\n# Having the FSM states automatically decoded and annotated on to the waveform diagram, helps when debugging by making it easier to:\n# * visualize the operation of a FSM\n# * confirm that the intended FSM has been properly specified, or\n# * locate errors in the FSM operation caused either by improper specification, or unexpected operating conditions\n\n# ### Skill levels required for project\n\n# __Python:__ intermediate\n#\n# __Digital logic:__ intermediate\n#\n# __WaveDrom__: intermediate\n\n# ### Example\n\n# _`logictools`_ can capture the operation of a FSM. It can then display the operation as a waveform. This is achieved using the Trace Analyzer to monitor the operation of the FSM Generator, _under the hood_. The Wavedrom library is also invoked, _under the hood_, to display the waveform. 
The following script provides an example of a 3-bit, up/down Gray code counter:\n\n\nfrom pynq.lib.logictools import Waveform\nfrom pprint import pprint\nfrom pynq.overlays.logictools import LogicToolsOverlay\n\n\nlogictools_olay = LogicToolsOverlay('logictools.bit')\n\n# Specify FSM to generate\ngray_cntr_spec = {'inputs': [('reset', 'D0'), ('direction', 'D1')],\n 'outputs': [('bit2', 'D3'), ('bit1', 'D4'), ('bit0', 'D5')],\n 'states': ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7'],\n 'transitions': [['01', 'S0', 'S1', '000'],\n ['00', 'S0', 'S7', '000'],\n ['01', 'S1', 'S2', '001'],\n ['00', 'S1', 'S0', '001'],\n ['01', 'S2', 'S3', '011'],\n ['00', 'S2', 'S1', '011'],\n ['01', 'S3', 'S4', '010'],\n ['00', 'S3', 'S2', '010'],\n ['01', 'S4', 'S5', '110'],\n ['00', 'S4', 'S3', '110'],\n ['01', 'S5', 'S6', '100'],\n ['00', 'S5', 'S4', '100'],\n ['01', 'S6', 'S7', '101'],\n ['00', 'S6', 'S5', '101'],\n ['01', 'S7', 'S0', '111'],\n ['00', 'S7', 'S6', '111'],\n ['1-', '*', 'S0', '']]}\n\n# Instantiate fsm_gen\nfsm_gen = logictools_olay.fsm_generator\n# Set trace to 16 samples initially\nfsm_gen.trace(num_analyzer_samples=16)\n# Configure fsm_gen with _spec\nfsm_gen.setup(gray_cntr_spec)\n# Run FSM, capture operation, and display waveform\nfsm_gen.run()\nfsm_gen.show_waveform()\n\n\n# By default, the trace waveform show the FSM inputs and the FSM outputs. To see the state vector bits, we need add an extra parameter in the `setup method` as follows:\n\n\n# Reset fsm_gen before invoking setup again\nfsm_gen.reset()\n\n# Configure fsm_gen with _spec\nfsm_gen.setup(gray_cntr_spec, use_state_bits=True)\n# Run FSM, capture operation, and display waveform\nfsm_gen.run()\nfsm_gen.show_waveform()\n\n\n# We can access the data dictionary that was used to generate this waveform with the following script:\n\n\nfsm_gen_wave_dict = fsm_gen.waveform.waveform_dict\npprint(fsm_gen_wave_dict)\n\n\n# This format of the data returned is one which is specified by the `WaveDrom` library. This is because, internally, `Waveform` objects are represented using `WaveDrom` format, and rendered using `WaveDrom` methods.\n#\n# The format is referred to as WaveDrom's WaveJSON format. For more details on how this works, refer to the [WaveDrom waveform tutorial](http://wavedrom.com/tutorial.html)\n#\n# Since this data is simply a WaveJSON dict, we can extend it with any additional annotations that we would find helpful. 
In this case, we want to add the label for each state the FSM sequences through.\n\n\n# Let's annotate the first 16 states manually as an example\nexpected_state_seq = ['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7',\n 'S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7']\nno_of_states = len(expected_state_seq)\n\nfsm_gen_wave_dict['signal'].extend([{},\n ['FSM states',\n {},\n {'name': 'state_label',\n 'wave': '2' * no_of_states,\n 'data': expected_state_seq},\n {}\n ]])\n", "project_metadata": {"full_name": "drichmond/PYNQ-Hackathon-2017", "description": "General Repository for PYNQ Hackathon Resources", "topics": [], "git_url": "git://github.com/drichmond/PYNQ-Hackathon-2017.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2017-10-06T17:30:20Z", "size": 41244, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 315523}, "last_updated": "2020-10-06T09:57:15Z"}, "intent": "# Inspect structure"}, {"original_comment": "# Trim out highly correlated features\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport yaml\nimport re\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom string import ascii_letters\nfrom utils.write_exp_utils import Experiment\nfrom models.feature_impact_review import train_loop\nfrom utils.orchestra_utils import return_timed_spine, sampler, time_mark_generator, generate_temporal_features, preprocessor, rinse_spines, temporal_split\nfrom utils.misc_utils import connect_rds, calculate_accident_pct\nfrom utils.write_exp_utils import ResultConfig\nfrom sklearn.metrics import *\nimport sys\nimport signal\nfrom datetime import datetime\nfrom random import choice\nfrom itertools import product\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.model_selection import ParameterGrid\nfrom sklearn.ensemble import RandomForestClassifier\nimport sklearn.preprocessing\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\nimport psycopg2\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nexperiment_id = 13 # experiment id to refer to\ncorr_thres = 0.75 # absolute correlation threhold to remove highhly correlated features\nk_features = 50 # number of top k features to select\n\n\n# # Feature Selection\n# - Remove highly correltated features\n# - select top k features\n# - save them in yaml format\n#\n# Please edit the experiment_id to generate a list of features based on feature configuration (this configuration should contain PostGres feature tables, make sure that the name/version each table is correct)\n\n#%%\n\nconn = connect_rds()\n\n\nexperiment = Experiment(experiment_id, conn)\nspine_creator = \"\"\"\n-- this would need to be called through a Python Script and replace variables\n-- such as rounding, lag, segmentation table to join and so-forth.\n-- this joins ongeval to hectopunten through the ongeval_hectopunten link\n-- date rounding decides the granularity at which we want to make prediction.\n-- I am not sure if I can do\n\nselect\ndate_trunc('hour', datetime) + date_part('minute', datetime)::int / {0} * interval '{0} min' as datetime_rounded,\nseg.hectokey_merged,\ncount(*) as accidents\nfrom rws_clean.ongevallen as o\ninner join {1} as seg\non seg.hectokey = o.hectokey\ngroup by datetime_rounded, seg.hectokey_merged\n\"\"\".format(experiment.TIME_GRANULARITY, experiment.SEGMENTATION_TABLE)\n\nspine = pd.read_sql(spine_creator, con=conn)\n\nspine_label = return_timed_spine(\n spine, experiment.LABEL_START, 
experiment.LABEL_END)\nspine_test = return_timed_spine(\n spine, experiment.TEST_LABEL_START, experiment.TEST_LABEL_END)\nprint(\n \"\"\"\n ##################################\n ### CALLING TRAIN_LOOP\n ##################################\n \"\"\")\n\n\nX_train, y_train, X_val, y_val, pre_sample_train = train_loop(\n spine_label=spine_label, spine_test=spine_test, experiment=experiment, conn=conn)\n# close connection\n\nconn.close()\n\n\n# # Features correlation\n\n#%%\n\n# remove extra bool features\nX_train.drop(X_train.filter(regex='_False').columns, axis=1, inplace=True)\n\n#%%\n\nX_train.shape\n\n#%%\n\nsns.set(style=\"white\")\n# Generate a large random dataset\n# rs = np.random.RandomState(33)\n# d = pd.DataFrame(data=rs.normal(size=(100, 26)),\n# columns=list(ascii_letters[26:]))\n\n# Compute the correlation matrix\n#corr = X_train[['flow_avgxlanes_avgseg','flow_avgxlanes_maxseg', 'flow_avgxlanes_minseg','num_lanes_min_mean','num_lanes_max_mean','max_speedlim_day','max_speedlim_night','mean_speedlim_day','mean_speedlim_night','min_speedlim_day','eve_rushhour_True','eve_rushhour_False']].corr()\ncorr = X_train[X_train.columns[:100]].corr()\n# Generate a mask for the upper triangle\nmask = np.zeros_like(corr, dtype=np.bool)\nmask[np.triu_indices_from(mask)] = True\n\n# Set up the matplotlib figure\nf, ax = plt.subplots(figsize=(40, 40))\n#f, ax = plt.subplots()\n\n\n# Generate a custom diverging colormap\ncmap = sns.diverging_palette(220, 10, as_cmap=True)\n\n# Draw the heatmap with the mask and correct aspect ratio\n\nsns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,\n square=True, linewidths=.5, cbar_kws={\"shrink\": .5})\n\n#%%\n\ncorr = X_train[X_train.columns[:]].corr()\ncorr_abs = corr.abs()\n\nunstacked_corr = corr_abs.unstack()\nsorted_corr = unstacked_corr.sort_values(kind=\"quicksort\", ascending=False)\n\n#%%\n\n# Absolute correlation score\nfor index, val in sorted_corr[sorted_corr != 1.0][:2000].iteritems():\n print(index, val)\n\n#%%\n\n# Absolute correlation score\nblacklist = list()\nfor index, val in sorted_corr[sorted_corr > corr_thres].iteritems():\n if (index[0] != index[1]):\n if index[0] not in blacklist and index[1] not in blacklist:\n blacklist.append(index[1])\n elif bool(index[0] not in blacklist) != bool(index[1] not in blacklist): # XOR\n pass\nprint(blacklist)\n\n#%%", "target_code": "X_train = X_train.drop(blacklist, errors='ignore', axis=1)\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport yaml\nimport re\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom string import ascii_letters\nfrom utils.write_exp_utils import Experiment\nfrom models.feature_impact_review import train_loop\nfrom utils.orchestra_utils import return_timed_spine, sampler, time_mark_generator, generate_temporal_features, preprocessor, rinse_spines, temporal_split\nfrom utils.misc_utils import connect_rds, calculate_accident_pct\nfrom utils.write_exp_utils import ResultConfig\nfrom sklearn.metrics import *\nimport sys\nimport signal\nfrom datetime import datetime\nfrom random import choice\nfrom itertools import product\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.model_selection import ParameterGrid\nfrom sklearn.ensemble import RandomForestClassifier\nimport sklearn.preprocessing\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\nimport psycopg2\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nexperiment_id = 13 # experiment id to refer 
to\ncorr_thres = 0.75 # absolute correlation threhold to remove highhly correlated features\nk_features = 50 # number of top k features to select\n\n\n# # Feature Selection\n# - Remove highly correltated features\n# - select top k features\n# - save them in yaml format\n#\n# Please edit the experiment_id to generate a list of features based on feature configuration (this configuration should contain PostGres feature tables, make sure that the name/version each table is correct)\n\n\nconn = connect_rds()\n\n\nexperiment = Experiment(experiment_id, conn)\nspine_creator = \"\"\"\n-- this would need to be called through a Python Script and replace variables\n-- such as rounding, lag, segmentation table to join and so-forth.\n-- this joins ongeval to hectopunten through the ongeval_hectopunten link\n-- date rounding decides the granularity at which we want to make prediction.\n-- I am not sure if I can do\n\nselect\ndate_trunc('hour', datetime) + date_part('minute', datetime)::int / {0} * interval '{0} min' as datetime_rounded,\nseg.hectokey_merged,\ncount(*) as accidents\nfrom rws_clean.ongevallen as o\ninner join {1} as seg\non seg.hectokey = o.hectokey\ngroup by datetime_rounded, seg.hectokey_merged\n\"\"\".format(experiment.TIME_GRANULARITY, experiment.SEGMENTATION_TABLE)\n\nspine = pd.read_sql(spine_creator, con=conn)\n\nspine_label = return_timed_spine(\n spine, experiment.LABEL_START, experiment.LABEL_END)\nspine_test = return_timed_spine(\n spine, experiment.TEST_LABEL_START, experiment.TEST_LABEL_END)\nprint(\n \"\"\"\n ##################################\n ### CALLING TRAIN_LOOP\n ##################################\n \"\"\")\n\n\nX_train, y_train, X_val, y_val, pre_sample_train = train_loop(\n spine_label=spine_label, spine_test=spine_test, experiment=experiment, conn=conn)\n# close connection\n\nconn.close()\n\n\n# # Features correlation\n\n\n# remove extra bool features\nX_train.drop(X_train.filter(regex='_False').columns, axis=1, inplace=True)\n\n\nX_train.shape\n\n\nsns.set(style=\"white\")\n# Generate a large random dataset\n# rs = np.random.RandomState(33)\n# d = pd.DataFrame(data=rs.normal(size=(100, 26)),\n# columns=list(ascii_letters[26:]))\n\n# Compute the correlation matrix\n#corr = X_train[['flow_avgxlanes_avgseg','flow_avgxlanes_maxseg', 'flow_avgxlanes_minseg','num_lanes_min_mean','num_lanes_max_mean','max_speedlim_day','max_speedlim_night','mean_speedlim_day','mean_speedlim_night','min_speedlim_day','eve_rushhour_True','eve_rushhour_False']].corr()\ncorr = X_train[X_train.columns[:100]].corr()\n# Generate a mask for the upper triangle\nmask = np.zeros_like(corr, dtype=np.bool)\nmask[np.triu_indices_from(mask)] = True\n\n# Set up the matplotlib figure\nf, ax = plt.subplots(figsize=(40, 40))\n#f, ax = plt.subplots()\n\n\n# Generate a custom diverging colormap\ncmap = sns.diverging_palette(220, 10, as_cmap=True)\n\n# Draw the heatmap with the mask and correct aspect ratio\n\nsns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,\n square=True, linewidths=.5, cbar_kws={\"shrink\": .5})\n\n\ncorr = X_train[X_train.columns[:]].corr()\ncorr_abs = corr.abs()\n\nunstacked_corr = corr_abs.unstack()\nsorted_corr = unstacked_corr.sort_values(kind=\"quicksort\", ascending=False)\n\n\n# Absolute correlation score\nfor index, val in sorted_corr[sorted_corr != 1.0][:2000].iteritems():\n print(index, val)\n\n\n# Absolute correlation score\nblacklist = list()\nfor index, val in sorted_corr[sorted_corr > corr_thres].iteritems():\n if (index[0] != index[1]):\n if index[0] not in 
blacklist and index[1] not in blacklist:\n blacklist.append(index[1])\n elif bool(index[0] not in blacklist) != bool(index[1] not in blacklist): # XOR\n pass\nprint(blacklist)\n\n", "project_metadata": {"full_name": "dssg/rws_accident_prediction_public", "description": "rws_accident_prediction", "topics": [], "git_url": "git://github.com/dssg/rws_accident_prediction_public.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-11-06T15:17:37Z", "size": 26432, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 9921779, "Python": 166306, "PLSQL": 26723, "Shell": 1334}, "last_updated": "2019-04-09T11:19:23Z"}, "intent": "# Trim out blacklisted cases"}, {"original_comment": "# ## Adding Suffixes\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## DBC - Commodities\n\n#%%\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport matplotlib.pyplot as plt\nfrom statsmodels.tsa.stattools import adfuller\nimport numpy as np\nimport seaborn as sns\nimport os\n\n#%%\n\nticker = 'DBC'\n\n#%%\n\nstart = '2007-01-01'\nend = '2019-08-21'\n\n#%%\n\nfinancial_data = web.DataReader(ticker, 'yahoo', start, end)\n\n\n# ## Vizualizating\n\n#%%\n\nfor col in financial_data.columns[:5]:\n plt.figure(figsize=(18, 2))\n plt.title(col)\n financial_data[col].plot()\n\n\n# ## Rolling Mean\n\n#%%\n\nexp_rolmean = financial_data['Close'].ewm(halflife=2).mean()\ndata_minus_exp_rolmean = financial_data['Close'] - exp_rolmean\n\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(data_minus_exp_rolmean, color='blue', label='Sales - rolling mean')\nplt.legend(loc='best')\nplt.title('Price while the rolling mean is subtracted')\nplt.show(block=False)\n\n\n# ## Differentiating\n\n#%%\n\ndata_diff = financial_data['Close'].diff(periods=1)\ndata_diff.head(10)\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(data_diff, color='blue', label='Price - rolling mean')\nplt.legend(loc='best')\nplt.title('Differenced Price Series')\nplt.show(block=False)\n\n\n# ## RSI - Relative Strength Index\n\n#%%\n\ndef RSI(df, col):\n\n values = df[col]\n\n RSI = [0]*14\n\n up, down = [], []\n\n for i in range(0, 14):\n if values[i+1] > values[i]:\n high = values[i+1]-values[i]\n up.append(high)\n elif values[i+1] < values[i]:\n low = values[i]-values[i+1]\n down.append(low)\n\n avup = sum(up)/14\n avdown = sum(down)/14+0.00000000000000001\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n\n RSI.append(rsi)\n\n for row in range(15, len(values)):\n if values[row] > values[row-1]:\n high = values[row]-values[row-1]\n avup = (avup*13+high)/14\n avdown = avdown*13/14\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n RSI.append(rsi)\n\n else:\n low = values[row-1]-values[row]\n avup = (avup*13)/14\n avdown = (avdown*13+low)/14\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n RSI.append(rsi)\n\n df['RSI'] = RSI\n\n#%%\n\nRSI(financial_data, 'Close')\nfinancial_data = financial_data[financial_data.RSI != 0]\n\n#%%\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(financial_data['RSI'], color='blue', label='RSI')\nplt.legend(loc='best')\nplt.axhline(y=30, color='g', linestyle='-')\nplt.axhline(y=70, color='r', linestyle='-')\nplt.title('RSI')\nplt.show(block=False)\n\n\n# ## The Dickey Fuller Test\n\n#%%\n\ndftest = adfuller(financial_data['Close'])\n\n# Extract and display test results in a user friendly manner\ndfoutput = pd.Series(dftest[0:4], index=[\n 'Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])\nfor key, value in dftest[4].items():\n dfoutput['Critical Value (%s)' % 
key] = value\n# print(dftest)\nprint('Results of Dickey-Fuller Test:')\nprint(dfoutput)\n\n\n# ## Features Correlations\n\n#%%\n\ncorr = financial_data.corr()\ncorr.style.background_gradient(cmap='coolwarm')\n\n#%%\n\ncorr = financial_data.corr()\ncorr.style.background_gradient(cmap='coolwarm')\n\n#%%\n\nfinancial_data.head()", "target_code": "new_cols = []\nfor c in list(financial_data.columns):\n updated = c+'_'+ticker\n new_cols.append(updated)\nfinancial_data.columns = new_cols\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## DBC - Commodities\n\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport matplotlib.pyplot as plt\nfrom statsmodels.tsa.stattools import adfuller\nimport numpy as np\nimport seaborn as sns\nimport os\n\n\nticker = 'DBC'\n\n\nstart = '2007-01-01'\nend = '2019-08-21'\n\n\nfinancial_data = web.DataReader(ticker, 'yahoo', start, end)\n\n\n# ## Vizualizating\n\n\nfor col in financial_data.columns[:5]:\n plt.figure(figsize=(18, 2))\n plt.title(col)\n financial_data[col].plot()\n\n\n# ## Rolling Mean\n\n\nexp_rolmean = financial_data['Close'].ewm(halflife=2).mean()\ndata_minus_exp_rolmean = financial_data['Close'] - exp_rolmean\n\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(data_minus_exp_rolmean, color='blue', label='Sales - rolling mean')\nplt.legend(loc='best')\nplt.title('Price while the rolling mean is subtracted')\nplt.show(block=False)\n\n\n# ## Differentiating\n\n\ndata_diff = financial_data['Close'].diff(periods=1)\ndata_diff.head(10)\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(data_diff, color='blue', label='Price - rolling mean')\nplt.legend(loc='best')\nplt.title('Differenced Price Series')\nplt.show(block=False)\n\n\n# ## RSI - Relative Strength Index\n\n\ndef RSI(df, col):\n\n values = df[col]\n\n RSI = [0]*14\n\n up, down = [], []\n\n for i in range(0, 14):\n if values[i+1] > values[i]:\n high = values[i+1]-values[i]\n up.append(high)\n elif values[i+1] < values[i]:\n low = values[i]-values[i+1]\n down.append(low)\n\n avup = sum(up)/14\n avdown = sum(down)/14+0.00000000000000001\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n\n RSI.append(rsi)\n\n for row in range(15, len(values)):\n if values[row] > values[row-1]:\n high = values[row]-values[row-1]\n avup = (avup*13+high)/14\n avdown = avdown*13/14\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n RSI.append(rsi)\n\n else:\n low = values[row-1]-values[row]\n avup = (avup*13)/14\n avdown = (avdown*13+low)/14\n RS = avup/avdown\n rsi = 100 - (100/(1+RS))\n RSI.append(rsi)\n\n df['RSI'] = RSI\n\n\nRSI(financial_data, 'Close')\nfinancial_data = financial_data[financial_data.RSI != 0]\n\n\nfig = plt.figure(figsize=(16, 3))\nplt.plot(financial_data['RSI'], color='blue', label='RSI')\nplt.legend(loc='best')\nplt.axhline(y=30, color='g', linestyle='-')\nplt.axhline(y=70, color='r', linestyle='-')\nplt.title('RSI')\nplt.show(block=False)\n\n\n# ## The Dickey Fuller Test\n\n\ndftest = adfuller(financial_data['Close'])\n\n# Extract and display test results in a user friendly manner\ndfoutput = pd.Series(dftest[0:4], index=[\n 'Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])\nfor key, value in dftest[4].items():\n dfoutput['Critical Value (%s)' % key] = value\n# print(dftest)\nprint('Results of Dickey-Fuller Test:')\nprint(dfoutput)\n\n\n# ## Features Correlations\n\n\ncorr = financial_data.corr()\ncorr.style.background_gradient(cmap='coolwarm')\n\n\ncorr = 
financial_data.corr()\ncorr.style.background_gradient(cmap='coolwarm')\n\n\nfinancial_data.head()\n\n\n\n", "project_metadata": {"full_name": "ganevniko/SP500-SPY-ETF-Daily-Price-Prediction", "description": "It is well known that the stock market exhibits very high dimensionality due to the almost unlimited number of factors that can affect it which makes it very difficult to predict. Studying how global stock market indexes respond to headlines can provide a major advantage in predicting stock movements and making trade decisions. Naturally, fundamental and technical indicators are not to be neglected and the goal of the project is to combine all of these aspects to achieve a model that thinks as an experienced trader.", "topics": ["python", "data-science", "keras", "scikit-learn", "aws", "vizualisation", "candlestick-chart", "deep-learning", "machine-learning", "artificial-intelligence", "dimensionality-reduction", "principal-component-analysis", "sentiment-analysis", "arima-model"], "git_url": "git://github.com/ganevniko/SP500-SPY-ETF-Daily-Price-Prediction.git", "stars": 7, "watchers": 7, "forks": 1, "created": "2019-09-09T21:21:48Z", "size": 7176, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6596878, "Python": 1}, "last_updated": "2020-12-05T20:37:56Z"}, "intent": "# add ticker as suffix to all columns"}, {"original_comment": "# ## Creating Dataframe for applying transformations\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Import required packages\n\n#%%\n\nfrom sklearn.metrics import classification_report\nfrom skmultilearn.problem_transform import LabelPowerset\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import BaggingClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss\nfrom sklearn.cluster import KMeans\nfrom yellowbrick.text import UMAPVisualizer\nfrom yellowbrick.text import FreqDistVisualizer\nfrom sklearn import metrics\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.preprocessing import MultiLabelBinarizer\nimport warnings\nfrom yellowbrick.text import TSNEVisualizer\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\nimport pandas as pd\nfrom nltk.corpus import reuters\nimport nltk\nnltk.download('wordnet')\nwarnings.filterwarnings(\"ignore\")\n\n\n# ## Getting train and test dataset from nltk reuters corpus\n\n#%%\n\ntrain_documents, train_categories = zip(\n *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])\ntest_documents, test_categories = zip(\n *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])\n\n#%%\n\nprint(\"Number of training documents:\", len(train_documents))\nprint(\"Number of testing documents:\", len(test_documents))\n\n\n# # Convert the categorical labels to Multi Label Encodings\n\n#%%\n\nmlb = MultiLabelBinarizer()\ntrain_labels = mlb.fit_transform(train_categories)\ntest_labels = mlb.transform(test_categories)", "target_code": "import pandas as 
pd\n\ntrainData = {\"content\": train_documents}\ntestData = {\"content\": test_documents}\ntrainDf = pd.DataFrame(trainData, columns=[\"content\"])\ntestDf = pd.DataFrame(testData, columns=[\"content\"])\n", "context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Import required packages\n\n\nfrom sklearn.metrics import classification_report\nfrom skmultilearn.problem_transform import LabelPowerset\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import BaggingClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss\nfrom sklearn.cluster import KMeans\nfrom yellowbrick.text import UMAPVisualizer\nfrom yellowbrick.text import FreqDistVisualizer\nfrom sklearn import metrics\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.preprocessing import MultiLabelBinarizer\nimport warnings\nfrom yellowbrick.text import TSNEVisualizer\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.corpus import reuters\nimport nltk\nnltk.download('wordnet')\nwarnings.filterwarnings(\"ignore\")\n\n\n# ## Getting train and test dataset from nltk reuters corpus\n\n\ntrain_documents, train_categories = zip(\n *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])\ntest_documents, test_categories = zip(\n *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])\n\n\nprint(\"Number of training documents:\", len(train_documents))\nprint(\"Number of testing documents:\", len(test_documents))\n\n\n# # Convert the categorical labels to Multi Label Encodings\n\n\nmlb = MultiLabelBinarizer()\ntrain_labels = mlb.fit_transform(train_categories)\ntest_labels = mlb.transform(test_categories)\n\n\n\n", "project_metadata": {"full_name": "MageshDominator/Blog_References", "description": "Code and content references for TechnovativeThinker blog posts", "topics": [], "git_url": "git://github.com/MageshDominator/Blog_References.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2019-12-25T13:53:00Z", "size": 447, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 555941, "Python": 233259, "Shell": 56}, "last_updated": "2020-11-15T10:28:03Z"}, "intent": "# create dataframe from documents"}]