code_block,too_long,marks,graph_vertex_id "# My forecasting COVID-19 confirmed cases and fatalities between March 19 and April 30 # My submission scored 0.52281 import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_absolute_error from sklearn.metrics import mean_squared_error # model from catboost import Pool from catboost import CatBoostRegressor from xgboost import XGBRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import BaggingRegressor #plot pd.plotting.register_matplotlib_converters() import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,2,45.0 "# load training and testing data subm = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') train_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv', index_col='Id', parse_dates=True) test_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv', index_col='ForecastId', parse_dates=True)",No,5,45.0 subm,No,5,41.0 "# see testing data test_data",No,5,41.0 "# ...and training data train_data",No,5,41.0 train_data.describe(),No,5,40.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,3,22.0 train_data.describe(include=['O']),No,5,40.0 test_data.describe(),No,5,40.0 test_data.describe(include=['O']),No,5,40.0 train_data.shape,No,5,58.0 test_data.shape,No,5,58.0 "# detect missing values in training train_data.isna().sum()",No,5,39.0 "# ...in testing data test_data.isna().sum()",No,5,39.0 "#metric def RMSLE(pred,actual): return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))",No,4,28.0 "#Convert data in integer train_data[Date]= pd.to_datetime(train_data[Date]).dt.strftime(""%m%d"").astype(int) test_data[Date]= pd.to_datetime(test_data[Date]).dt.strftime(""%m%d"").astype(int)",No,5,16.0 "# separate the vector correct answers ('ConfirmedCases' and 'Fatalities') from the training data train_data.dropna(axis=0, subset=['ConfirmedCases', 'Fatalities'], inplace=True) y_conf = train_data.ConfirmedCases train_data.drop(['ConfirmedCases'], axis=1, inplace=True) y_fatal = train_data.Fatalities train_data.drop(['Fatalities'], axis=1, inplace=True)",No,4,17.0 "# Select categorical columns in training and testing data categorical_cols = [cname for cname in train_data.columns if train_data[cname].dtype == ""object""]",No,5,77.0 "# replace missing values in training and testing data # as we saw above, the data are absent only in 'Province/State' train_data.fillna('-', inplace=True) test_data.fillna('-',inplace=True)",No,5,17.0 "# perform LabelEncoder with categorical data (categorical_cols) encodering = LabelEncoder() encod_train_data = train_data.copy() encod_test_data = test_data.copy() for col in categorical_cols: encod_train_data[col] = encodering.fit_transform(train_data[col]) encod_test_data[col] = encodering.fit_transform(test_data[col])",No,4,7.0 "# split encod_train_data into training(X_train) and validation(X_valid) data # and split vector correct answers ('ConfirmedCases') X_train, X_valid, y_train, y_valid = train_test_split(encod_train_data, y_conf, train_size=0.8, test_size=0.2, random_state=0)",No,5,13.0 "# determine the best metrics for the model def get_score(n_estimators): model = GradientBoostingRegressor(n_estimators=n_estimators) scores = cross_val_score(model, X_train, y_train, cv=5) return scores.mean()",No,5,84.0 "def rmse_score(n_estimators): rmse = np.sqrt(-cross_val_score(GradientBoostingRegressor(n_estimators=n_estimators), X_train, y_train, scoring=""neg_mean_squared_error"", cv = 5)) return(rmse)",No,5,84.0 "# select model and install parameters model = CatBoostRegressor(iterations=4000, depth=9, learning_rate=0.5, loss_function='RMSE')",No,5,4.0 "# train the model model.fit(X_train,y_train)",No,5,7.0 "# preprocessing of validation data, get predictions preds = model.predict(X_valid) print('MAE:', mean_absolute_error(y_valid, preds))",No,4,27.0 "# make the prediction using the resulting model preds = model.predict(X_valid) print('MSE:', mean_squared_error(y_valid, preds))",No,3,48.0 "x_list = [X_train, X_valid] y_list = [y_train, y_valid] scoring = list(map(lambda x,y: round(model.score(x,y)*100, 2), x_list, y_list)) scoring",No,4,49.0 "# get predictions test data final_preds_conf = model.predict(encod_test_data)",No,5,48.0 "# split encod_train_data into training(X_train) and validation(X_valid) data # and split vector 
correct answers ('Fatalities') X_train_f, X_valid_f, y_train_f, y_valid_f = train_test_split(encod_train_data, y_fatal, train_size=0.8, test_size=0.2, random_state=0)",No,5,13.0 "# train the model model.fit(X_train_f,y_train_f)",No,5,7.0 "# preprocessing of validation data, get predictions preds = model.predict(X_valid_f) print('MAE:', mean_absolute_error(y_valid_f, preds))",No,4,27.0 "# make the prediction using the resulting model preds = model.predict(X_valid_f) print('MSE:', mean_squared_error(y_valid_f, preds))",No,3,48.0 "x_list_f = [X_train_f, X_valid_f] y_list_f = [y_train_f, y_valid_f] scoring = list(map(lambda x,y: round(model.score(x,y)*100, 2), x_list_f, y_list_f)) scoring",No,4,49.0 "# get predictions test data final_preds_fatal = model.predict(encod_test_data)",No,5,48.0 "# and save test predictions to file output.to_csv('submission.csv', index=False) print('Complete!')",No,5,25.0 output.tail(30),No,5,41.0 output.describe(),No,5,40.0 !pip install mxnet autogluon,No,5,87.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "import pandas as pd import numpy as np import shutil, os from autogluon import TabularPrediction as task directory = '../input/covid19-global-forecasting-week-2/' label_cases = 'ConfirmedCases' # name of target variable to predict in this competition label_fatalities = 'Fatalities' outputdir_cases = 'AGmodels_' + label_cases + '/' # where to store trained models outputdir_fatalities = 'AGmodels_' + label_fatalities + '/' # where to store trained models if os.path.exists(outputdir_cases): shutil.rmtree(outputdir_cases) if os.path.exists(outputdir_fatalities): shutil.rmtree(outputdir_fatalities) train_data = task.Dataset(file_path=directory+'train.csv') train_data.drop([""Id""], axis=1, inplace=True) log_cases_vals = np.log(train_data[label_cases] + 1) log_fatalities_vals = np.log(train_data[label_fatalities] + 1) train_data[label_fatalities] = log_fatalities_vals train_data[label_cases] = log_cases_vals train_data_cases = train_data.drop([label_fatalities], axis=1) train_data_fatalities = train_data.drop([label_cases], axis=1) train_data.head()'",No,2,45.0 "df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') df_submit = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 df_train.head(),No,5,41.0 df_train.info(),No,5,40.0 "print(""Number of Country_Region: "", df_train['Country_Region'].nunique()) print(""Dates are ranging from day"", min(df_train['Date']), ""to day"", max(df_train['Date']), "", a total of"", df_train['Date'].nunique(), ""days"") print(""The countries that have Province/Region given are : "", 
df_train[df_train['Province_State'].isna()==False]['Country_Region'].unique())'",No,5,54.0 df_train.columns,No,5,71.0 df_train['Province_State'].unique(),No,5,57.0 "plt.figure(figsize=(40,40)) temp_df= df_train[df_train['ConfirmedCases']>5000] sns.barplot(y = temp_df['Country_Region'] , x = temp_df['ConfirmedCases']>10000) sns.set_context('paper') plt.ylabel(""Country_Region"",fontsize=30) plt.xlabel(""Counts"",fontsize=30) plt.title(""Counts of Countries affected by the pandemic that have confirmed cases > 5000"",fontsize=30) plt.xticks(rotation = 90)'",No,4,53.0 "confirmed_total_dates = df_train.groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_dates = df_train.groupby(['Date']).agg({'Fatalities':['sum']}) total_dates = confirmed_total_dates.join(fatalities_total_dates) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7)) total_dates.plot(ax=ax1) ax1.set_title(""Global confirmed cases"", size=13) ax1.set_ylabel(""Total Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_dates.plot(ax=ax2, color='orange') ax2.set_title(""Global deceased cases"", size=13) ax2.set_ylabel(""Total Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,4,40.0 "italy = df_train[df_train['Country_Region'] == 'Italy'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = italy) plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases per date in Italy',size=20) plt.show()",No,5,75.0 "italy = df_train[df_train['Country_Region'] == 'Italy'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = italy,color='orange') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) plt.title('Fatalities in Italy per Date',size=20) plt.show()",No,5,75.0 "usa = df_train[df_train['Country_Region'] == 'US'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = usa,color='g') plt.xticks(rotation = 90,size=13) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases in US per Date',size=20) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = usa,color='purple') plt.title('Fatalities in US per Date',size=20) plt.xticks(rotation = 90,size=13) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.barplot(x='Province_State',y='ConfirmedCases',data=usa,ci=None) plt.xticks(rotation = 90,size=13) plt.xlabel('Province_State',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases in US Province_State ',size=20) plt.show()",No,5,33.0 "#we now do the analysis of NYC as per week. 
import warnings warnings.filterwarnings('ignore') temp_df = usa[usa['Province_State'] == 'New York'] temp_df['Date'] = pd.to_datetime(temp_df['Date']) temp_df.insert(6,'Week',temp_df['Date'].dt.week) f,axes = plt.subplots(1,2,figsize=(12,5)) sns.lineplot(x = 'Week',y = 'ConfirmedCases',color='r',data=temp_df,ax = axes[0]) sns.lineplot(x = 'Week',y = 'Fatalities',color='b',data=temp_df,ax = axes[1]) axes[0].title.set_text('Confirmed Cases in NYC per week') axes[1].title.set_text('Fatalities in NYC per week')",No,4,14.0 "china = df_train[df_train['Country_Region'] == 'China'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = china,color='aqua') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) sns.set_context('paper') plt.title('Confirmed Cases in China per Date',size=20) plt.show()",No,5,75.0 "china = df_train[df_train['Country_Region'] == 'China'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = china,color='grey') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) sns.set_context('paper') plt.title('Fatalities in China per Date',size=20) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.barplot(x='Province_State',y='ConfirmedCases',data=china) plt.xticks(rotation = 90,size=13) plt.title('Confirmed Cases in China Province_State',size=20) plt.ylabel('Confirmed Cases',size=15) plt.xlabel('Province_State',size=15) plt.show()",No,5,33.0 "#we now do the analysis of Hubei as per week. import warnings warnings.filterwarnings('ignore') china_t = china[china['Province_State'] == 'Hubei'] china_t['Date'] = pd.to_datetime(china_t['Date']) china_t.insert(6,'Week',china_t['Date'].dt.week) f,axes = plt.subplots(1,2,figsize=(12,5)) sns.lineplot(x = 'Week',y = 'ConfirmedCases',color='r',data=china_t,ax = axes[0]) sns.lineplot(x = 'Week',y = 'Fatalities',color='b',data=china_t,ax = axes[1]) axes[0].title.set_text('Confirmed Cases in Hubei per week') axes[1].title.set_text('Fatalities in Hubei per week')",No,4,14.0 "df_train = df_train[['Date','Province_State','Country_Region','ConfirmedCases','Fatalities']] df_train.head()",No,3,10.0 "print(""Read in libraries"") import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.optimize import curve_fit from statsmodels.tsa.statespace.sarimax import SARIMAX from statsmodels.tsa.arima_model import ARIMA from random import random",No,5,22.0 "import pandas as pd import numpy as np # Very big number to be used for a parameter values of some models BIG_NUMBER = 1000000",No,5,77.0 "train_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv', na_filter=False) test_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv', na_filter=False) submission_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 train_df,No,5,41.0 test_df,No,5,41.0 submission_df,No,5,41.0 "train_df = train_df[[""Province_State"", ""Country_Region"", ""Date"", ""ConfirmedCases"", ""Fatalities""]] train_df",No,5,10.0 "len(set(train_df[""Country_Region""])) == len(set(test_df[""Country_Region""]))",No,3,37.0 "set(train_df[""Country_Region""]) == set(test_df[""Country_Region""])",No,3,37.0 "countries = set(train_df[""Country_Region""]) countries",No,5,77.0 "# This function assumes that the number of confirmed cases/fatalities doubles every n days # The task is to find the optimal n from curve fitting, separately for cases and fatalities 
def func(x, b, n): return x * (b ** (1/n))",No,5,53.0 "from scipy.optimize import curve_fit # Iterate through countries sorted alphabetically from A to Z. # As some countries, like USA, China, Canada, UK, Australia, have provinces/states, decend to the province level # (i.e., each province within such countries gets its owm model) for country in sorted(countries): print(""Country: "", country) # Select information related to the current country c_df = train_df[train_df[""Country_Region""] == country] # Get a list of country's provinces provinces = set(c_df[""Province_State""]) print(""Provinces: "", provinces) # Iterate over provinces for province in sorted(provinces): # Create a compound name for each country when provinces are present if province != """": full_country = country + ""-"" + province else: full_country = country # From country information, select the current province information p_df = c_df[c_df[""Province_State""] == province] # Prepare data for building a model X1 = p_df[p_df[""ConfirmedCases""] > 0][""ConfirmedCases""].values[:-1] # Omit the last value in order to properly form labels y1 = p_df[p_df[""ConfirmedCases""] > 0][""ConfirmedCases""].values[1:] # Notice that ""labels"" are in fact ""data"" shifted one position to the right X2 = p_df[p_df[""Fatalities""] > 0][""Fatalities""].values[:-1] # Omit the last value in order to properly form labels y2 = p_df[p_df[""Fatalities""] > 0][""Fatalities""].values[1:] # Notice that ""labels"" are in fact ""data"" shifted one position to the right # For confirmed cases, find the optimal value of a model parameter and perform the curve fitting if possible # Treat special cases when either X or y or both contains all zeroes or just one (last) non-zero value! if len(X1) > 1 and len(y1) > 1: # Build a model only if there are two or more non-zero values popt, _ = curve_fit(func, X1, y1) popt_cases = popt # there is just one parameter else: # otherwise, just set the parameter to a very big number, implying that there would be almost no change in numbers popt_cases = 2, BIG_NUMBER # Treat the special case if it turned out that the parameter value is zero if popt_cases[1] == 0: # Set the parameter to a very large value m so that the quantity 2**(1/m) -> 1, which implies that # the numbers won't grow popt_cases = 2, BIG_NUMBER print(""{}: Optimal parameter value for confirmed cases: {}"".format(full_country, popt_cases)) # For fatalities, find the optimal value of a model parameter and perform the curve fitting if possible # Treat special cases when either X or y or both contains all zeroes or just one (last) non-zero value! 
if len(X2) > 1 and len(y2) > 1: popt, _ = curve_fit(func, X2, y2) popt_fatalities = popt # there is just one parameter else: # otherwise, just set the parameter to a very big number, implying that there would be almost no change in numbers popt_fatalities = 2, BIG_NUMBER # Treat the special case if it turned out that the parameter value is zero if popt_fatalities[1] == 0: # Set the parameter to a very large value m so that the quantity 2**(1/m) -> 1, which implies that # the numbers won't grow popt_fatalities = 2, BIG_NUMBER print(""{}: Optimal parameter value for fatalities: {}"".format(full_country, popt_fatalities)) # Select test data for a given country and its province if the latter is given condition = (test_df[""Province_State""] == province) & (test_df[""Country_Region""] == country) t_df = test_df[condition] # Get the initial values to be used for generating future values for confirmed cases and fatalities last_train_date = t_df[""Date""].values[0] print(last_train_date) cases = p_df[p_df[""Date""] == last_train_date][""ConfirmedCases""].values[0] print(cases) fatalities = p_df[p_df[""Date""] == last_train_date][""Fatalities""].values[0] print(fatalities) # It's necessary to drop index in 't_df': otherwise, 't_df.loc[i, ""ForecastId""]' would fail, # starting from the second country t_df.reset_index(inplace=True, drop=True) for i in range(t_df.shape[0]): # Get a row index to write to idx = t_df.loc[i, ""ForecastId""] - 1 # make predictions cases = round(cases * (popt_cases[0] ** (1/popt_cases[1])), 0) submission_df.loc[idx, ""ConfirmedCases""] = cases fatalities = round(fatalities * (popt_fatalities[0] ** (1/popt_fatalities[1])), 0) submission_df.loc[idx, ""Fatalities""] = fatalities print(""*""*70)'",No,5,53.0 "submission_df.to_csv(""submission.csv"", index=False, header=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): print(dirname) for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output. 
PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv') print(""*""*100) print(df_train.head()) print(""*""*100) print(df_test.head()) print(""*""*100)'",No,3,45.0 "df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) print(""*""*50) print(df_train.info()) print(""*""*50) print(df_test.info()) print(""*""*50)'",No,3,16.0 "NULL_VAL = ""NULL_VAL"" def fillState(state, country): if state == NULL_VAL: return country return state X_Train = df_train.loc[:, ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] X_Train['State'].fillna(NULL_VAL, inplace=True) X_Train['State'] = X_Train.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Train.loc[:, 'Date'] = X_Train.Date.dt.strftime(""%m%d"") X_Train[""Date""] = X_Train[""Date""].astype(int) X_Test = df_test.loc[:, ['State', 'Country', 'Date', 'ForecastId']] X_Test['State'].fillna(NULL_VAL, inplace=True) X_Test['State'] = X_Test.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Test.loc[:, 'Date'] = X_Test.Date.dt.strftime(""%m%d"") X_Test[""Date""] = X_Test[""Date""].astype(int) print(""*""*50) print(X_Train.head()) print(""*""*50) print(X_Test.head()) print(""*""*50)'",No,3,17.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() X_Train.Country = le.fit_transform(X_Train.Country) X_Train['State'] = le.fit_transform(X_Train['State']) X_Test.Country = le.fit_transform(X_Test.Country) X_Test['State'] = le.fit_transform(X_Test['State']) print(""*""*50) print(X_Train.head()) print(""*""*50) print(X_Test.head()) print(""*""*50)'",No,4,20.0 "from xgboost import XGBRegressor from sklearn.preprocessing import LabelEncoder le = LabelEncoder() countries = X_Train.Country.unique() df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) for country in countries: states = X_Train.loc[X_Train.Country == country, :].State.unique() for state in states: condition_train = (X_Train.Country == country) & (X_Train.State == state) # Get X and y (train) X_Train_CS = X_Train.loc[condition_train, ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases'] y2_Train_CS = X_Train_CS.loc[:, 'Fatalities'] X_Train_CS = X_Train_CS.loc[:, ['State', 'Country', 'Date']] # Get X and y (test) condition_test = (X_Test.Country == country) & (X_Test.State == state) X_Test_CS = X_Test.loc[condition_test, ['State', 'Country', 'Date', 'ForecastId']] # Save forcast id for submission X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId'] X_Test_CS = X_Test_CS.loc[:, ['State', 'Country', 'Date']] model1 = XGBRegressor(n_estimators=1000) model1.fit(X_Train_CS, y1_Train_CS) y1_pred = model1.predict(X_Test_CS) model2 = XGBRegressor(n_estimators=1000) model2.fit(X_Train_CS, y2_Train_CS) y2_pred = model2.predict(X_Test_CS) df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred}) df_out = pd.concat([df_out, df], axis=0) # Done for state loop # Done for country Loop df_out.ForecastId = df_out.ForecastId.astype('int') 
df_out.tail()",No,3,7.0 "df_out.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv')",No,5,45.0 df_test.head(),No,5,41.0 "df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True)",No,5,61.0 "df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True)",No,5,16.0 df_test.info(),No,5,40.0 "y1_Train = df_train.iloc[:, -2] y1_Train.head()",No,3,21.0 "y2_Train = df_train.iloc[:, -1] y2_Train.head()",No,4,14.0 "EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state",No,5,53.0 "#X_Train = df_train.loc[:, ['State', 'Country', 'Date']] X_Train = df_train.copy() X_Train['State'].fillna(EMPTY_VAL, inplace=True) X_Train['State'] = X_Train.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Train.loc[:, 'Date'] = X_Train.Date.dt.strftime(""%m%d"") X_Train[""Date""] = X_Train[""Date""].astype(int) X_Train.head()'",No,3,17.0 "#X_Test = df_test.loc[:, ['State', 'Country', 'Date']] X_Test = df_test.copy() X_Test['State'].fillna(EMPTY_VAL, inplace=True) X_Test['State'] = X_Test.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Test.loc[:, 'Date'] = X_Test.Date.dt.strftime(""%m%d"") X_Test[""Date""] = X_Test[""Date""].astype(int) X_Test.head()'",No,4,17.0 "from sklearn import preprocessing le = preprocessing.LabelEncoder()",No,4,22.0 "X_Train.Country = le.fit_transform(X_Train.Country) X_Train['State'] = le.fit_transform(X_Train['State']) X_Train.head()",No,5,20.0 "X_Test.Country = le.fit_transform(X_Test.Country) X_Test['State'] = le.fit_transform(X_Test['State']) X_Test.head()",No,5,20.0 "df_train.loc[df_train.Country == 'Afghanistan', :]",No,5,14.0 df_test.tail(),No,5,41.0 "from warnings import filterwarnings filterwarnings('ignore')",No,5,23.0 le = preprocessing.LabelEncoder(),No,5,20.0 "from xgboost import XGBRegressor import lightgbm as lgb",No,5,22.0 "countries = X_Train.Country.unique() df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) df_out2 = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) for country in countries: states = X_Train.loc[X_Train.Country == country, :].State.unique() #print(country, states) # check whether string is nan or not for state in states: X_Train_CS = X_Train.loc[(X_Train.Country == country) & (X_Train.State == state), ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases'] y2_Train_CS = X_Train_CS.loc[:, 'Fatalities'] X_Train_CS = X_Train_CS.loc[:, ['State', 'Country', 'Date']] X_Train_CS.Country = le.fit_transform(X_Train_CS.Country) X_Train_CS['State'] = le.fit_transform(X_Train_CS['State']) X_Test_CS = X_Test.loc[(X_Test.Country == country) & (X_Test.State == state), ['State', 'Country', 'Date', 'ForecastId']] X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId'] X_Test_CS = X_Test_CS.loc[:, ['State', 
'Country', 'Date']] X_Test_CS.Country = le.fit_transform(X_Test_CS.Country) X_Test_CS['State'] = le.fit_transform(X_Test_CS['State']) # XGBoost model1 = XGBRegressor(n_estimators=2000) model1.fit(X_Train_CS, y1_Train_CS) y1_pred = model1.predict(X_Test_CS) model2 = XGBRegressor(n_estimators=2000) model2.fit(X_Train_CS, y2_Train_CS) y2_pred = model2.predict(X_Test_CS) # LightGBM model3 = lgb.LGBMRegressor(n_estimators=2000) model3.fit(X_Train_CS, y1_Train_CS) y3_pred = model3.predict(X_Test_CS) model4 = lgb.LGBMRegressor(n_estimators=2000) model4.fit(X_Train_CS, y2_Train_CS) y4_pred = model4.predict(X_Test_CS) df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred}) df2 = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y3_pred, 'Fatalities': y4_pred}) df_out = pd.concat([df_out, df], axis=0) df_out2 = pd.concat([df_out2, df2], axis=0) # Done for state loop # Done for country Loop",No,3,7.0 "df_out.ForecastId = df_out.ForecastId.astype('int') df_out2.ForecastId = df_out2.ForecastId.astype('int')",No,5,16.0 "df_out['ConfirmedCases'] = (1/2)*(df_out['ConfirmedCases'] + df_out2['ConfirmedCases']) df_out['Fatalities'] = (1/2)*(df_out['Fatalities'] + df_out2['Fatalities'])",No,5,8.0 "df_out['ConfirmedCases'] = df_out['ConfirmedCases'].round().astype(int) df_out['Fatalities'] = df_out['Fatalities'].round().astype(int)",No,5,16.0 df_out.tail(),No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) pd.plotting.register_matplotlib_converters() import matplotlib.pyplot as plt import seaborn as sns # Any results you write to the current directory are saved as output.",No,5,88.0 "train_df = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test_df = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")",No,5,45.0 train_df.head(50),No,5,41.0 "''' from pandas_profiling import ProfileReport train_profile = ProfileReport(train_df, title='Pandas Profiling Report', html={'style':{'full_width':True}}) train_profile '''",No,3,22.0 train_df.info(),No,5,40.0 "common_value = ""UNKNOWN"" # Replacing all the Province_State that are null by the Country_Region values train_df.Province_State.fillna(train_df.Country_Region, inplace=True) test_df.Province_State.fillna(test_df.Country_Region, inplace=True) # Handling the Date column # 1. Converting the object type column into datetime type train_df.Date = train_df.Date.apply(pd.to_datetime) test_df.Date = test_df.Date.apply(pd.to_datetime) # 2. 
Creating new features #train_df['ReportDay_year'] = train_df['Date'].dt.year #Not required this column because all the data is of this year train_df['ReportDay_month'] = train_df['Date'].dt.month train_df['ReportDay_week'] = train_df['Date'].dt.week train_df['ReportDay_day'] = train_df['Date'].dt.day #test_df['ReportDay_year'] = test_df['Date'].dt.year test_df['ReportDay_month'] = test_df['Date'].dt.month test_df['ReportDay_week'] = test_df['Date'].dt.week test_df['ReportDay_day'] = test_df['Date'].dt.day'",No,4,8.0 "#Dropping the date column train_df.drop(""Date"", inplace = True, axis = 1) test_df.drop(""Date"", inplace = True, axis = 1)",No,5,10.0 train_df.Province_State.value_counts(),No,5,72.0 " from sklearn.preprocessing import LabelEncoder le = LabelEncoder() train_df.Country_Region = le.fit_transform(train_df.Country_Region) train_df['Province_State'] = le.fit_transform(train_df['Province_State']) test_df.Country_Region = le.fit_transform(test_df.Country_Region) test_df['Province_State'] = le.fit_transform(test_df['Province_State']) ",No,5,20.0 \,No,3,20.0 "''' # Removing duplicate entries train_df = train_df.loc[:,~train_df.columns.duplicated()] test_df = test_df.loc[:,~test_df.columns.duplicated()] print (test_df.shape) '''",No,4,19.0 "''' # Dropping the object type columns train_df.drop(objList, axis=1, inplace=True) test_df.drop(objList, axis=1, inplace=True) print (train_df.shape) '''",No,4,10.0 \,No,5,71.0 test_df.info(),No,5,40.0 "X_train = train_df.drop([""Id"", ""ConfirmedCases"", ""Fatalities""], axis = 1) Y_train_CC = train_df[""ConfirmedCases""] Y_train_Fat = train_df[""Fatalities""] #X_test = test_df.drop([""ForecastId""], axis = 1) X_test = test_df.drop([""ForecastId""], axis = 1) ",No,5,21.0 \,No,5,28.0 "from sklearn.model_selection import ShuffleSplit, cross_val_score skfold = ShuffleSplit(random_state=7)",No,5,84.0 " #1.Ridge Regression #Model import from sklearn.linear_model import Ridge #train classifier reg_CC = Ridge(alpha=1.0) reg_Fat = Ridge(alpha=1.0) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #rmsle_svm = test_model_r2(clf_svm, ""CC"") #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,84.0 " #2.Lasso Regression #Model import from sklearn import linear_model #train classifier reg_CC = linear_model.Lasso(alpha=0.1) reg_Fat = linear_model.Lasso(alpha=0.1) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #rmsle_svm = test_model_r2(clf_svm, ""CC"") #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #3. SVM #Model import from sklearn import svm #train classifier reg_CC = svm.SVC() reg_Fat = svm.SVC() #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #3. 
ElasticNet #Model import from sklearn.linear_model import ElasticNet #train classifier reg_CC = ElasticNet(random_state=0) reg_Fat = ElasticNet(random_state=0) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #5. LinearRegression #Model import from sklearn.linear_model import LinearRegression #train classifier reg_CC = LinearRegression() reg_Fat = LinearRegression() #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 \,No,5,28.0 \,No,5,28.0 \,No,3,28.0 \,No,5,28.0 " #5. BaggingClassifier from sklearn.ensemble import BaggingRegressor from sklearn.tree import DecisionTreeRegressor clf_bgr_CC = BaggingRegressor(base_estimator = DecisionTreeRegressor()) clf_bgr_Fat = BaggingRegressor(base_estimator = DecisionTreeRegressor()) rmsle_bgr_CC = test_model(clf_bgr_CC, ""CC"") rmsle_bgr_Fat = test_model(clf_bgr_Fat, ""Fat"") print (rmsle_bgr_CC, rmsle_bgr_Fat) ",No,5,84.0 \,No,4,28.0 "reg_CC.fit(X_train, Y_train_CC) Y_pred_CC = reg_CC.predict(X_test) reg_Fat.fit(X_train, Y_train_Fat) Y_pred_Fat = reg_Fat.predict(X_test) ",No,5,48.0 print (Y_pred_Fat),No,5,53.0 "#Using pd.to_datetime for adding new features df_train['Date'] = pd.to_datetime(df_train['Date']) df_train.insert(1,'Week',df_train['Date'].dt.week) df_train.insert(2,'Day',df_train['Date'].dt.day) df_train.insert(3,'DayofWeek',df_train['Date'].dt.dayofweek) df_train.insert(4,'DayofYear',df_train['Date'].dt.dayofyear) df_test['Date'] = pd.to_datetime(df_test['Date']) df_test.insert(1,'Week',df_test['Date'].dt.week) df_test.insert(2,'Day',df_test['Date'].dt.day) df_test.insert(3,'DayofWeek',df_test['Date'].dt.dayofweek) df_test.insert(4,'DayofYear',df_test['Date'].dt.dayofyear)",No,5,8.0 "# Replacing all the Province_State that are null by the Country_Region values df_train.Province_State.fillna(df_train.Country_Region, inplace=True) df_test.Province_State.fillna(df_test.Country_Region, inplace=True)",No,5,17.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df_train.Country_Region = le.fit_transform(df_train.Country_Region) df_train['Province_State'] = le.fit_transform(df_train['Province_State']) df_test.Country_Region = le.fit_transform(df_test.Country_Region) df_test['Province_State'] = le.fit_transform(df_test['Province_State']) ",No,5,20.0 "#One Hot Encoding columns def one_hot(df, cols): """""" @param df pandas DataFrame @param cols a list of columns to encode @return a DataFrame with one-hot encoding """""" i = 0 for each in cols: #print (each) dummies = pd.get_dummies(df[each], prefix=each, drop_first= True) if i == 0: print (dummies) i = i + 1 df = pd.concat([df, dummies], axis=1) return df",No,5,20.0 "#Handling categorical data objList = df_train.select_dtypes(include = ""object"").columns df_train = one_hot(df_train, objList) df_test = one_hot(df_test, objList) print (df_train.shape)",No,4,8.0 "#Avoiding duplicated data. 
df_train = df_train.loc[:,~df_train.columns.duplicated()] df_test = df_test.loc[:,~df_test.columns.duplicated()] print (df_test.shape)",No,4,19.0 "#reading data data = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') test_data = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') submission = pd.read_csv('../input/covid19-global-forecasting-week-2/submission.csv') print(data.shape) print(test_data.shape) print(submission.shape)",No,4,45.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,22.0 "dftrain = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") dftrain[""GrowthRate""] = dftrain[""ConfirmedCases""] / dftrain.ConfirmedCases.shift(1) print(dftrain.Date.unique()) dftest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") print(dftest.columns.values) dfsubmission = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/submission.csv"")",No,3,45.0 "growth_per_country = {} means_per_country = {} last_per_country = {} def train(country, region, growth): if growth < 2.0: print(growth) growth_list = growth_per_country.get((country,region), list()) growth_list.append(growth) growth_per_country[(country,region)] = growth_list def predict(country, region): if not (country,region) in means_per_country: means_per_country[(country,region)] = np.mean(growth_per_country[(country,region)]) growth = means_per_country[(country,region)] return growth",No,2,7.0 "## Training for row in dftrain.itertuples(): train(row.Country_Region, row.Province_State, row.GrowthRate)",No,3,7.0 "## Current submission for row in dftest.itertuples(): if(row.ForecastId%100 == 0): print(row.ForecastId) if type(row.Province_State)!=str: dfnow = dftrain[dftrain.Country_Region == row.Country_Region] else: dfnow = dftrain[dftrain.Country_Region == row.Country_Region][dftrain.Province_State == row.Province_State] filterDate = dfnow[""Date""].isin([row.Date]) if len(dfnow[filterDate].values) == 0: growth = predict(row.Country_Region, row.Province_State) pred = pred * growth predfat = predfat * abs(growth - 0.1) else: pred = dfnow[filterDate][""ConfirmedCases""].values[0] predfat = dfnow[filterDate][""Fatalities""].values[0] dfsubmission.at[row.ForecastId-1, ""ConfirmedCases""] = int(pred) dfsubmission.at[row.ForecastId-1, ""Fatalities""] = int(predfat) dfsubmission dfsubmission.to_csv('submission.csv', index=False) # end, the rest is experimental code'",No,3,27.0 print(dfsubmission.head(60)),No,5,41.0 "dfsubmission[""Date""] = dftest[""Date""] dfsub = dfsubmission[dftest.Country_Region == 'Ireland'][dftest.Province_State.isnull()] dfsub[:20].plot(""Date"", ""ConfirmedCases"")'",No,3,33.0 "df = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") df.head(40) print(df.columns.values) df[""GrowthRate""] = df[""ConfirmedCases""] / df.ConfirmedCases.shift(1) 
df[""Growth""] = df[""ConfirmedCases""] - df.ConfirmedCases.shift(1) df[""PredictedCasesByRate""] = df[""ConfirmedCases""].shift(1) * df.GrowthRate.shift(1) df[""PredictedCases""] = df[""ConfirmedCases""].shift(1) + df.Growth.shift(1) df[""ErrorByRate""] = (df.ConfirmedCases-df.PredictedCasesByRate)/df.ConfirmedCases df[""Error""] = (df.ConfirmedCases - df.PredictedCases)/df.ConfirmedCases df[""FGrowth""] = df[""Fatalities""]/df.Fatalities.shift(1) print(df.head()) dff = df[df.Country_Region == 'Italy'][df.Province_State.isnull()][df.ConfirmedCases >= 100] dff.plot(""Date"", [""ConfirmedCases"",""PredictedCases"", ""PredictedCasesByRate""]) dff.plot(""Date"", [""ErrorByRate"", ""Error""]) dff.plot(""Date"", ""GrowthRate"") dff.head(15)'",No,5,53.0 "dftest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") dftest.head(40)",No,4,45.0 "dfg = df.groupby([df.Country_Region, df.Province_State]) dfg.head()",No,3,60.0 "import numpy as np import pandas as pd df = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") df.shape",No,3,45.0 "loc_group = [""Province_State"", ""Country_Region""] def preprocess(df): df[""Date""] = df[""Date""].astype(""datetime64[ms]"") for col in loc_group: df[col].fillna(""none"", inplace=True) return df df = preprocess(df) sub_df = preprocess(pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")) df.head()",No,4,45.0 "# Dropping the object type columns df_train.drop(objList, axis=1, inplace=True) df_test.drop(objList, axis=1, inplace=True) print (df_train.shape)",No,4,10.0 "df[""Date""].min(), df[""Date""].max()",No,5,40.0 "TARGETS = [""ConfirmedCases"", ""Fatalities""] for col in TARGETS: df[col] = np.log1p(df[col])",No,5,8.0 "for col in TARGETS: df[""prev_{}"".format(col)] = df.groupby(loc_group)[col].shift()",No,5,60.0 "df = df[df[""Date""] > df[""Date""].min()].copy() df.head()",No,4,14.0 df_train,No,5,41.0 "X = df_train.drop(['Date', 'ConfirmedCases', 'Fatalities'], axis=1) y = df_train[['ConfirmedCases', 'Fatalities']]",No,5,21.0 "from sklearn.linear_model import LinearRegression from sklearn.linear_model import BayesianRidge from sklearn.neighbors import KNeighborsRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score, KFold from sklearn.metrics import make_scorer, r2_score, mean_squared_log_error from sklearn.ensemble import BaggingRegressor",No,5,22.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)",No,5,13.0 y_train.head(),No,5,41.0 "n_folds = 5 cv = KFold(n_splits = 5, shuffle=True, random_state=42).get_n_splits(X_train.values)",No,5,84.0 "def predict_scores(reg_alg): r2 = make_scorer(r2_score) m = reg_alg() m.fit(X_train, y_train['ConfirmedCases']) y_pred = m.predict(X_test) m_r = cross_val_score(m, X_train, y_train['ConfirmedCases'], cv=cv, scoring = r2) sc_Cases.append(m_r) m.fit(X_train, y_train['Fatalities']) y_pred = m.predict(X_test) m_r2 = cross_val_score(m, X_train, y_train['Fatalities'], cv=cv, scoring = r2) sc_Fatalities.append(m_r2) reg_models = [KNeighborsRegressor, LinearRegression, RandomForestRegressor, GradientBoostingRegressor, DecisionTreeRegressor,BayesianRidge, BaggingRegressor] sc_Cases = [] sc_Fatalities = [] for x in reg_models: predict_scores(x)",No,5,3.0 sc_Cases,No,5,53.0 sc_Fatalities,No,5,53.0 from sklearn.ensemble import BaggingRegressor,No,5,22.0 " 
#Hyperparameter tuning from sklearn.model_selection import RandomizedSearchCV,GridSearchCV param_grid = { 'n_estimators':[10, 30, 50, 100,250,500,750,1000,1250,1500,1750], 'max_samples':[2,4,6,8,10,20,40,60,100], ""max_features"": [0.5, 1.0], 'n_jobs':[-2, -1, 1, 2, 3, 4, 5], ""bootstrap_features"": [True, False] } '''param_grid = {""criterion"": [""mae""], ""min_samples_split"": [10, 20, 40], ""max_depth"": [2, 6, 8], ""min_samples_leaf"": [20, 40, 100], ""max_leaf_nodes"": [5, 20, 100], }''' asdf = BaggingRegressor() clf_CC = RandomizedSearchCV(asdf, param_grid ) clf_Fat = RandomizedSearchCV(asdf, param_grid ) clf_CC.fit(X_train, y_train['ConfirmedCases']) clf_Fat.fit(X_train, y_train['Fatalities']) '",No,5,6.0 "model1 = clf_CC model1.fit(X_train, y_train['ConfirmedCases']) model2 = clf_Fat model2.fit(X_train, y_train['Fatalities'])",No,5,7.0 "df_test['ConfirmedCases'] = model1.predict(df_test.drop(['Date', 'ForecastId'], axis=1)) df_test['Fatalities'] = model2.predict(df_test.drop(['Date', 'ForecastId', 'ConfirmedCases'], axis=1))",No,5,48.0 "import warnings warnings.filterwarnings('ignore') df_results = df_test[['ForecastId', 'ConfirmedCases', 'Fatalities']] df_results['ConfirmedCases'] = df_results['ConfirmedCases'].astype(int) df_results['Fatalities'] = df_results['Fatalities'].astype(int) df_results.head()",No,4,16.0 "df_results.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os import multiprocessing for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') # e.g. 4015976448 mem_gib = mem_bytes/(1024.**3) # e.g. 
3.74 print(""RAM: %f GB"" % mem_gib) print(""CORES: %d"" % multiprocessing.cpu_count()) # Any results you write to the current directory are saved as output.",No,5,88.0 "import plotly.graph_objects as go import matplotlib.pyplot as plt from tqdm import tqdm import time from datetime import datetime from pathlib import Path from sklearn import preprocessing import keras.backend as K from keras.models import Sequential from keras.layers import Dense, LSTM, Dropout, GRU from keras.callbacks import EarlyStopping from sklearn.preprocessing import StandardScaler, MinMaxScaler",No,5,22.0 "train = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") test = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") train.tail()",No,4,45.0 test.tail(),No,5,41.0 train.info(),No,5,40.0 "train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date']) train['Country_Region'] = train['Country_Region'].astype(str) # train['Province_State'] = train['Province_State'].astype(str) test['Country_Region'] = test['Country_Region'].astype(str) # test['Province_State'] = test['Province_State'].astype(str)",No,5,16.0 "EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state train['Province_State'].fillna(EMPTY_VAL, inplace=True) train['Province_State'] = train.loc[:, ['Province_State', 'Country_Region']].apply(lambda x : fillState(x['Province_State'], x['Country_Region']), axis=1) test['Province_State'].fillna(EMPTY_VAL, inplace=True) test['Province_State'] = test.loc[:, ['Province_State', 'Country_Region']].apply(lambda x : fillState(x['Province_State'], x['Country_Region']), axis=1)'",No,5,17.0 "le = preprocessing.LabelEncoder() train['country_encoder'] = le.fit_transform(train['Country_Region']) train['date_int'] = train['Date'].apply(lambda x: datetime.strftime(x, '%m%d')).astype(int) test['country_encoder'] = le.transform(test['Country_Region']) test['date_int'] = test['Date'].apply(lambda x: datetime.strftime(x, '%m%d')).astype(int)",No,4,20.0 "le = preprocessing.LabelEncoder() train['province_encoder'] = le.fit_transform(train['Province_State']) test['province_encoder'] = le.transform(test['Province_State'])",No,5,20.0 "#TODO: takes 44m ish, consider multi-processing, multi-cores, run in GPU #TODO: create data_generate func from joblib import parallel_backend start_time = time.time() country = train['Country_Region'].drop_duplicates() train_df = train.copy() train_df.rename(columns={'Date': 'date', 'ConfirmedCases': 'cc_cases', 'Fatalities': 'ft_cases', 'Country_Region': 'country', 'Province_State': 'province'}, inplace=True) lags = np.arange(1,8,1) # lag of 1 to 7 with parallel_backend('threading', n_jobs = -1): with tqdm(total = len(list(train_df['date'].unique()))) as pbar: for d in train_df['date'].drop_duplicates(): for i in country: province = train_df[train_df['country'] == i]['province'].drop_duplicates() for j in province: mask = (train_df['date'] == d) & (train_df['country'] == i) & (train_df['province'] == j) for lag in lags: mask_org = (train_df['date'] == (d - pd.Timedelta(days=lag))) & (train_df['country'] == i) & (train_df['province'] == j) try: train_df.loc[mask, 'cc_cases_' + str(lag)] = train_df.loc[mask_org, 'cc_cases'].values except: train_df.loc[mask, 'cc_cases_' + str(lag)] = 0 try: train_df.loc[mask, 'ft_cases_' + str(lag)] = train_df.loc[mask_org, 'ft_cases'].values except: train_df.loc[mask, 'ft_cases_' + str(lag)] = 0 pbar.update(1) print('Time spent for building 
features is {} minutes'.format(round((time.time()-start_time)/60,1)))",No,4,8.0 "# train_df.to_csv(Path('/kaggle/working', 'train_df.csv')) # saved locally, reload it train_df = pd.read_csv(Path('/kaggle/working/', 'train_df.csv'), index_col = 0, parse_dates = ['date']) train_df[train_df['country'] == 'Italy'].tail()",No,4,45.0 "#TODO: walk forward validation def split_train_val(df, val_ratio): val_len = int(len(df) * val_ratio) train_set = df[:-val_len] val_set = df[-val_len:] return train_set, val_set",No,5,13.0 "test_fixed_cols = ['ForecastId', 'Province_State', 'Country_Region', 'Date'] fixed_cols = ['Id', 'province', 'country', 'date'] output_cols = ['cc_cases', 'ft_cases'] input_cols = list(set(train_df.columns.to_list()) - set(fixed_cols) - set(output_cols)) print('output columns are ', output_cols) print('input columns are ', input_cols) X = train_df[input_cols] y = train_df[output_cols]",No,5,21.0 "# split to cumulative and fatal features and build 2 separate models # split to train and validation set cc_input = ['cc_cases_1', 'cc_cases_2', 'cc_cases_3', 'cc_cases_4', 'cc_cases_5', 'cc_cases_6', 'cc_cases_7', 'country_encoder', 'province_encoder', 'date_int'] ft_input = ['ft_cases_1', 'ft_cases_2', 'ft_cases_3', 'ft_cases_4', 'ft_cases_5', 'ft_cases_6', 'ft_cases_7', 'country_encoder', 'province_encoder', 'date_int'] cc_output = ['cc_cases'] ft_output = ['ft_cases'] X_cc = X[cc_input] X_ft = X[ft_input] y_cc = y[cc_output] y_ft = y[ft_output] train_X_cc, val_X_cc = split_train_val(df = X_cc, val_ratio = 0.1) train_y_cc, val_y_cc = split_train_val(df = y_cc, val_ratio = 0.1) train_X_ft, val_X_ft = split_train_val(df = X_ft, val_ratio = 0.1) train_y_ft, val_y_ft = split_train_val(df = y_ft, val_ratio = 0.1)",No,5,13.0 "idx = np.random.RandomState(seed=42).permutation(train_X_cc.index) train_X_cc = train_X_cc.reindex(idx) train_y_cc = train_y_cc.reindex(idx) train_X_ft = train_X_ft.reindex(idx) train_y_ft = train_y_ft.reindex(idx) # train_y_cc.tail()",No,5,15.0 "# normalization X_scaler_cc = MinMaxScaler() X_train_cc = X_scaler_cc.fit_transform(train_X_cc) X_val_cc = X_scaler_cc.transform(val_X_cc) # intput/output 2D array-like y_scaler_cc = MinMaxScaler() y_train_cc = y_scaler_cc.fit_transform(train_y_cc) y_val_cc = y_scaler_cc.transform(val_y_cc) # array-like",No,5,18.0 "X_scaler_ft = MinMaxScaler() X_train_ft = X_scaler_ft.fit_transform(train_X_ft) X_val_ft = X_scaler_ft.transform(val_X_ft) # intput/output 2D array-like y_scaler_ft = MinMaxScaler() y_train_ft = y_scaler_ft.fit_transform(train_y_ft) y_val_ft = y_scaler_ft.transform(val_y_ft) # array-like",No,5,18.0 "print('Validate if train and test is splited correctly for 2 cases: ') print('cumulative cases training has shape ', X_train_cc.shape, y_train_cc.shape) print('fatal cases training has shape ', X_train_ft.shape, y_train_ft.shape) print('cumulative cases valid has shape ', X_val_cc.shape, y_val_cc.shape) print('fatal cases valid has shape ', X_val_ft.shape, y_val_ft.shape) #TODO print('Validate if train and test contains np.nan, np.inf, -np.inf after standardization: ')",No,3,41.0 "# if choose to not apply normalization, however it generates NaN in output... 
X_train_cc = train_X_cc.to_numpy() X_val_cc = val_X_cc.to_numpy() X_train_ft = train_X_ft.to_numpy() X_val_ft = val_X_ft.to_numpy() y_train_cc = train_y_cc.to_numpy() y_val_cc = val_y_cc.to_numpy() y_train_ft = train_y_ft.to_numpy() y_val_ft = val_y_ft.to_numpy()",No,3,12.0 "# for LSTM, intput.shape = (n_samples, 1, n_features) X_train_cc = X_train_cc.reshape(X_train_cc.shape[0], 1, X_train_cc.shape[1]) X_val_cc = X_val_cc.reshape(X_val_cc.shape[0], 1, X_val_cc.shape[1]) X_train_ft = X_train_ft.reshape(X_train_ft.shape[0], 1, X_train_ft.shape[1]) X_val_ft = X_val_ft.reshape(X_val_ft.shape[0], 1, X_val_ft.shape[1]) print(X_train_cc.shape, X_val_cc.shape, X_train_ft.shape, X_val_ft.shape)",No,3,12.0 "import pandas as pd import datetime import lightgbm as lgb import numpy as np from sklearn import preprocessing",No,4,22.0 "train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"")",No,5,45.0 train,No,5,41.0 train = train.append(test[test['Date']>'2020-03-31']),No,5,11.0 "train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')",No,5,16.0 train['day_dist'] = train['Date']-train['Date'].min(),No,5,8.0 train['day_dist'] = train['day_dist'].dt.days,No,5,8.0 "print(train['Date'].max()) #print(val['Date'].max()) print(test['Date'].min()) print(test['Date'].max()) #print(test['Date'].max()-test['Date'].min())",No,5,40.0 "cat_cols = train.dtypes[train.dtypes=='object'].keys() cat_cols",No,5,77.0 "for cat_col in cat_cols: train[cat_col].fillna('no_value', inplace = True)",No,5,17.0 "train['place'] = train['Province_State']+'_'+train['Country_Region'] #vcheck = train[(train['Date']>='2020-03-12')]",No,5,8.0 "for cat_col in ['place']: #train[cat_col].fillna('no_value', inplace = True) #train[cat_col].value_counts().idxmax() le = preprocessing.LabelEncoder() le.fit(train[cat_col]) train[cat_col]=le.transform(train[cat_col])",No,5,20.0 train.keys(),No,5,40.0 "drop_cols = ['Id','ForecastId', 'ConfirmedCases','Date', 'Fatalities', 'day_dist', 'Province_State', 'Country_Region'] #,'day_dist','shift_22_ft','shift_23_ft','shift_24_ft','shift_25_ft','shift_26_ft']",No,5,77.0 "#val = train[(train['Id']).isnull()==True] #train = train[(train['Id']).isnull()==False] val = train[(train['Date']>='2020-03-12')&(train['Id'].isnull()==False)] #test = train[(train['Date']>='2020-03-12')&(train['Id'].isnull()==True)] #train = train[(train['Date']<'2020-03-22')&(train['Id'].isnull()==False)]",No,5,14.0 val,No,5,53.0 "y_ft = train[""Fatalities""] y_val_ft = val[""Fatalities""] y_cc = train[""ConfirmedCases""] y_val_cc = val[""ConfirmedCases""] #train.drop(drop_cols, axis=1, inplace=True) #test.drop(drop_cols, axis=1, inplace=True) #val.drop(drop_cols, axis=1, inplace=True)",No,5,21.0 "# def rmsle (y_true, y_pred): return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))'",No,5,84.0 "def mape (y_true, y_pred): return np.mean(np.abs(y_pred -y_true)*100/(y_true+1))",No,5,84.0 dates = dates[dates>'2020-03-31'],No,5,14.0 train[train['Date']==date],No,5,14.0 test[test['Country_Region']=='Italy'],No,5,14.0 test[(test['Country_Region']=='China')&(test['Province_State']=='Zhejiang')],No,5,14.0 y_pred.mean(),No,5,40.0 print(len(test)),No,5,58.0 "train_sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"")",No,5,45.0 test.head(),No,5,41.0 test.loc[test['ConfirmedCases_x'].isnull()==True],No,5,14.0 
"test.loc[test['ConfirmedCases_x'].isnull()==True, 'ConfirmedCases_x'] =test.loc[test['ConfirmedCases_x'].isnull()==True, 'ConfirmedCases_y']",No,5,14.0 "test.loc[test['Fatalities_x'].isnull()==True, 'Fatalities_x'] = test.loc[test['Fatalities_x'].isnull()==True, 'Fatalities_y']",No,5,14.0 dates,No,5,53.0 "last_amount = test.loc[(test['Country_Region']=='Italy')&(test['Date']=='2020-03-31'),'ConfirmedCases_x']",No,5,14.0 "last_fat = test.loc[(test['Country_Region']=='Italy')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,14.0 last_fat.values[0],No,5,41.0 "test.loc[(test['Country_Region']=='Italy')] #&(test['Date']==date),'ConfirmedCases_x' ",No,5,14.0 "for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='Italy')&(test['Date']==date), 'ConfirmedCases_x']=last_amount.values[0] + i*(5000-(100*i)) test.loc[(test['Country_Region']=='Italy')&(test['Date']==date), 'Fatalities_x'] = last_fat.values[0]+i*(800-(10*i))",No,5,8.0 "last_amount = test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']=='2020-03-31'),'ConfirmedCases_x'] last_fat = test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,77.0 "i = 0 k = 30 for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']==date), 'Fatalities_x']= last_fat.values test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']==date), 'ConfirmedCases_x']= last_amount.values + i",No,5,8.0 "last_amount = test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']=='2020-03-31'),'ConfirmedCases_x'] last_fat = test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,14.0 "k=30 i=0 for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']==date),'ConfirmedCases_x']= last_amount.values[0] test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']==date),'Fatalities_x']= last_fat.values[0] + i ",No,5,8.0 sub,No,5,41.0 "sub.loc[sub['ConfirmedCases']<0,'ConfirmedCases']=0",No,5,8.0 "sub.loc[sub['Fatalities']<0, 'Fatalities']=0",No,5,14.0 sub['Fatalities'].describe(),No,5,40.0 sub['ConfirmedCases'].describe(),No,5,40.0 "sub.to_csv('submission.csv',index=False)",No,5,25.0 "# customize loss function which is aligned with kaggle evaluation def root_mean_squared_log_error(y_true, y_pred): return K.sqrt(K.mean(K.square(K.log(y_pred + 1) - K.log(y_true + 1)))) ",No,3,28.0 "#declaring only one model def GRU_model(n_1, input_dim, output_dim): model = Sequential() model.add(GRU(n_1,input_shape=(1, input_dim), activation='relu')) model.add(Dropout(0.1)) model.add(Dense(output_dim, activation='relu')) model.compile(loss=root_mean_squared_log_error, optimizer='adam') print(model.summary()) return model",No,5,4.0 "#TODO: debug sometimes it's getting inf. 
Suspect bad input model_cc = GRU_model(4, X_train_cc.shape[-1], y_train_cc.shape[-1]) model_ft = GRU_model(4, X_train_ft.shape[-1], y_train_ft.shape[-1]) early_stop = EarlyStopping(monitor='loss', patience=5, verbose=0, mode='min')",No,5,4.0 "groups_dict =dfg.groups for group, indexes in groups_dict.items(): print(group) tempdf = df.loc[indexes[0]:indexes[-1]] print(tempdf.shape) if False: tempdf[""Growth""] = tempdf.ConfirmedCases/tempdf.ConfirmedCases.shift(1) tempdf[""FGrowth""] = tempdf.Fatalities/tempdf.Fatalities.shift(1) tempdf.plot(""Date"", [""Growth"",""FGrowth""])",No,2,21.0 "b""dfa = df[df.Country_Region == 'Spain'][df.Province_State.isnull()][df.ConfirmedCases>10]\nprint(dftesta.shape)\nprint(dfa.shape)\n\nfrom matplotlib import pyplot\nfrom statsmodels.tsa.ar_model import AR\nfrom sklearn.metrics import mean_squared_error\n\nX = list(dfa.GrowthRate.values)\n\nX = [x for x in X if not np.isnan(x) and not np.isinf(x)]\nprint(len(X))\n\ntrain, test = X[:len(X)-6], X[len(X)-6:len(X)]\nprint(len(train))\n# train autoregression\nmodel = AR(train)\nmodel_fit = model.fit()\nwindow = model_fit.k_ar\ncoef = model_fit.params\n# walk forward over time steps in test\nhistory = train[len(train)-window:]\nhistory = [history[i] for i in range(len(history))]\npredictions = list()\nfor t in range(len(test)+31):\n\tlength = len(history)\n\tlag = [history[i] for i in range(length-window,length)]\n\tyhat = coef[0]\n\tfor d in range(window):\n\t\tyhat += coef[d+1] * lag[window-d-1]\n\tif t >= len(test):\n\t\ttest.append(yhat)\n\tobs = test[t]\n\tpredictions.append(yhat)\n\thistory.append(obs)\n\tprint('predicted=%f, expected=%f' % (yhat, obs))\nerror = mean_squared_error(test, predictions)\nprint('Test MSE: %.3f' % error)\n# plot\npyplot.plot(train+test)\npyplot.plot(train+predictions, color='red')\npyplot.show()""",No,4,8.0 "b""dfa = df[df.Country_Region == 'Italy'][df.Province_State.isnull()][df.ConfirmedCases>10]\nprint(dftesta.shape)\nprint(dfa.shape)\n\nfrom matplotlib import pyplot\nfrom statsmodels.tsa.ar_model import AR\nfrom sklearn.metrics import mean_squared_error\n\nX = list(dfa.GrowthRate.values)\n\nX = [x for x in X if not np.isnan(x) and not np.isinf(x)]\nprint(len(X))\n\ntrain, test = X[10:len(X)-6], X[len(X)-6:len(X)]\nprint(len(train))\n# train autoregression\nmodel = AR(train)\nmodel_fit = model.fit()\nprint('Lag: %s' % model_fit.k_ar)\nprint('Coefficients: %s' % model_fit.params)\n# make predictions\npredictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)\nfor i in range(len(predictions)):\n\tprint('predicted=%f, expected=%f' % (predictions[i], test[i]))\nerror = mean_squared_error(test, predictions)\nprint('Test MSE: %.3f' % error)\n# plot results\npyplot.plot(train[:15]+test)\npyplot.plot(train[:15]+list(predictions), color='red')\npyplot.show()""",No,4,7.0 "#dfsubmission.to_csv('submission.csv', index=False) ",No,4,7.0 "import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sns from sklearn.preprocessing import LabelEncoder from skmultilearn.problem_transform import BinaryRelevance from sklearn.naive_bayes import GaussianNB",No,1,45.0 import pandas as pd,No,5,22.0 "train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv')",No,5,45.0 test.info(),No,5,40.0 train.sample(3),No,5,41.0 "train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date'])",No,5,16.0 "train['Date'] = 
train['Date'].astype('int64') test['Date'] = test['Date'].astype('int64')",No,5,16.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() def FunLabelEncoder(df): for c in df.columns: if df.dtypes[c] == object: le.fit(df[c].astype(str)) df[c] = le.transform(df[c].astype(str)) return df",No,5,20.0 "train.iloc[:,:-2].sample(3)",No,5,41.0 "X = train.iloc[:,:-2] print(X.shape) X.sample(3)",No,3,41.0 "Y = train.iloc[:,-2:] print(Y.shape) Y.sample(3)",No,3,41.0 "from sklearn.model_selection import train_test_split trainX , valX, trainY, valY = train_test_split(X, Y, random_state=1)",No,5,13.0 "y1Train = trainY.iloc[:,0] print(y1Train.shape) y1Train.sample(3)",No,3,41.0 "y2Train = trainY.iloc[:,1] y2Train.sample(3)",No,5,41.0 "y1Val = valY.iloc[:,0] y1Val.sample(3)",No,5,41.0 "y2Val = valY.iloc[:,1] y2Val.sample(3)",No,4,41.0 "print(trainX.shape) trainX.sample(3)",No,4,41.0 print(trainX.info()),No,5,40.0 "trainX.iloc[:,1:].sample(3)",No,5,41.0 "from sklearn.tree import DecisionTreeRegressor lrModel1 = DecisionTreeRegressor(random_state = 27) %time lrModel1.fit(trainX.iloc[:,1:], y1Train)",No,3,4.0 "%time y1Pred = lrModel1.predict(valX.iloc[:,1:]) print(y1Pred[:,])",No,4,27.0 "from sklearn.metrics import mean_absolute_error print(""Accuracy in train set : "", lrModel1.score(trainX.iloc[:,1:], y1Train)) print(""RMSE : "", mean_absolute_error(y1Val, y1Pred)**(0.5))",No,4,28.0 "print(test.shape) test.sample(3)",No,3,41.0 "test.iloc[:,1:].sample(3)",No,5,41.0 "%time finalPred1 = lrModel1.predict(test.iloc[:,1:]) print(finalPred1[:,])",No,5,48.0 "%time finalPred2 = lrModel2.predict(test.iloc[:,1:]) print(finalPred2[:,])",No,5,48.0 "outputFile = pd.DataFrame({""ForecastId"": test.ForecastId, ""ConfirmedCases"": (finalPred1+0.5).astype('int'), ""Fatalities"": (finalPred2+0.5).astype('int')})'",No,5,12.0 outputFile.sample(3),No,5,41.0 "outputFile.to_csv(""submission.csv"", index=False)",No,5,25.0 "import plotly.express as px import plotly.graph_objs as go from plotly.subplots import make_subplots import plotly plotly.offline.init_notebook_mode() # For not show up chart error import matplotlib.pyplot as plt import matplotlib.animation as animation from IPython.display import HTML %matplotlib inline from tqdm import tqdm def RMSLE(pred,actual): return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))",No,4,22.0 "# Fix error in train data FirstDate = train.groupby('Country_Region').min()['Date'].unique()[0] train['Last Confirm'] = train['ConfirmedCases'].shift(1) while train[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate)].shape[0] > 0: train['Last Confirm'] = train['ConfirmedCases'].shift(1) train['Last Fatalities'] = train['Fatalities'].shift(1) train.loc[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate),'ConfirmedCases'] = train.loc[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate),'Last Confirm'] train.loc[(train['Last Fatalities'] > train['Fatalities']) & (train['Date'] > FirstDate),'Fatalities'] = train.loc[(train['Last Fatalities'] > train['Fatalities']) & (train['Date'] > FirstDate),'Last Fatalities'] train['Last Confirm'] = train['ConfirmedCases'].shift(1) train['Last Fatalities'] = train['Fatalities'].shift(1)",No,5,8.0 "RMSLE(df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases'].values,df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases_hat'].values)",No,5,49.0 "RMSLE(df_val[(df_val['Fatalities'].isnull() == 
False)]['Fatalities'].values,df_val[(df_val['Fatalities'].isnull() == False)]['Fatalities_hat'].values)",No,5,49.0 "val_score = [] for country in df_val['Country_Region'].unique(): df_val_country = df_val[(df_val['Country_Region'] == country) & (df_val['Fatalities'].isnull() == False)] val_score.append([country, RMSLE(df_val_country['ConfirmedCases'].values,df_val_country['ConfirmedCases_hat'].values),RMSLE(df_val_country['Fatalities'].values,df_val_country['Fatalities_hat'].values)]) df_val_score = pd.DataFrame(val_score) df_val_score.columns = ['Country','ConfirmedCases_Scored','Fatalities_Scored'] df_val_score.sort_values('ConfirmedCases_Scored', ascending = False)",No,3,49.0 "country = ""Vietnam"" df_val = df_val_1 df_val[df_val['Country_Region'] == country].groupby(['Date','Country_Region']).sum().reset_index()'",No,2,41.0 "country = ""Vietnam"" df_val = df_val_1 df_country = df_val[df_val['Country_Region'] == country].groupby(['Date','Country_Region']).sum().reset_index() df_train = train[(train['Country_Region'].isin(df_country['Country_Region'].unique())) & (train['ConfirmedCases'] > 0)].groupby(['Date']).sum().reset_index() idx = df_country[((df_country['ConfirmedCases'].isnull() == False) & (df_country['ConfirmedCases'] > 0))].shape[0] fig = px.line(df_country, x=""Date"", y=""ConfirmedCases_hat"", title='Forecast Total Cases of ' + df_country['Country_Region'].values[0]) fig.add_scatter(x=df_train['Date'], y=df_train['ConfirmedCases'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_country['Date'][0:idx], y=df_country['ConfirmedCases'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show() fig = px.line(df_country, x=""Date"", y=""Fatalities_hat"", title='Forecast Total Fatalities of ' + df_country['Country_Region'].values[0]) fig.add_scatter(x=df_train['Date'], y=df_train['Fatalities'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_country['Date'][0:idx], y=df_country['Fatalities'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show()'",No,3,33.0 "df_total = df_val.groupby(['Date']).sum().reset_index() df_train = train[(train['Country_Region'].isin(df_val['Country_Region'].unique())) & (train['ConfirmedCases'] > 0)].groupby(['Date']).sum().reset_index() idx = df_total[((df_total['ConfirmedCases'].isnull() == False) & (df_total['ConfirmedCases'] > 0))].shape[0] fig = px.line(df_total, x=""Date"", y=""ConfirmedCases_hat"", title='Total Cases of World Forecast') fig.add_scatter(x=df_train['Date'], y=df_train['ConfirmedCases'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_total['Date'][0:idx], y=df_total['ConfirmedCases'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show() fig = px.line(df_total, x=""Date"", y=""Fatalities_hat"", title='Total Fatalities of World Forecast') fig.add_scatter(x=df_train['Date'], y=df_train['Fatalities'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_total['Date'][0:idx], y=df_total['Fatalities'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show()'",No,4,33.0 "import requests from bs4 import BeautifulSoup req = requests.get('https://www.worldometers.info/coronavirus/') soup = BeautifulSoup(req.text, ""lxml"") df_country = soup.find('div',attrs={""id"" : ""nav-tabContent""}).find('table',attrs={""id"" : ""main_table_countries_today""}).find_all('tr') arrCountry = [] for i in range(1,len(df_country)-1): tmp = df_country[i].find_all('td') if 
(tmp[0].string.find('=(first_predict_date-pd.DateOffset(days=n_in)))& (region_dfs[key]['Date']'2020-03-18'] model_check = pred_df[pred_df['Date']<=test_check['Date'].max()]",No,4,53.0 "np.sqrt(mean_squared_log_error(y_true = test_check[['ConfirmedCases','Fatalities']], y_pred = model_check[['ConfirmedCases','Fatalities']]))",No,5,49.0 "sub = pred_df[['ConfirmedCases','Fatalities']] sub['ForecastId'] = test_data['ForecastId']",No,5,55.0 sub.sample(20),No,5,41.0 "sub.to_csv(""submission.csv"",index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import warnings warnings.filterwarnings(""ignore"")",No,3,22.0 "train = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') train.info()",No,4,45.0 train.head(),No,5,41.0 train.tail(),No,5,41.0 train.describe(),No,5,40.0 train.corr(),No,5,40.0 train.isnull().sum(),No,5,39.0 "test = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') test.info()",No,3,45.0 test.describe(),No,5,40.0 test.isnull().sum(),No,5,39.0 "#Changing dtype for dates from object to datetime train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date'])",No,5,16.0 train[train['Province_State'].isnull()]['Country_Region'].unique(),No,3,57.0 train[train['Province_State'].notnull()]['Country_Region'].unique(),No,4,57.0 "train['Province_State'] = np.where(train['Province_State'].isnull(), train['Country_Region'], train['Province_State']) test['Province_State'] = np.where(test['Province_State'].isnull(), test['Country_Region'], test['Province_State'])",No,5,17.0 train[train['Province_State'] == 'Diamond Princess'],No,4,41.0 train[train['Province_State'] == 'Diamond Princess']['Country_Region'].unique(),No,4,57.0 df = train.append(test),No,5,11.0 "group = df.groupby(['Province_State', 'Country_Region'])['Date'].count().reset_index() group",No,5,60.0 df[df['Province_State'] == 'Georgia']['Country_Region'].unique(),No,4,57.0 "#Distinguishing Province/State Georgia according to Country/Region df['Province_State'] = np.where((df['Country_Region'] == 'Georgia') & (df['Province_State'] == 'Georgia'), 'Country Georgia', df['Province_State'])",No,5,8.0 "#Viewing the total number of confirmeed cases and fatalities worldwide world = train.groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() plt.plot(world['Date'], world['ConfirmedCases'], label = 'Confirmed Cases') plt.plot(world['Date'], world['Fatalities'], label = 'Fatalities') plt.legend() plt.title('Total number of Confirmed Cases and Fatalities Worldwide') plt.xticks(rotation = 30) plt.show();",No,5,75.0 "#Plotting the number of confirmed cases and fatalities for each country country = train.groupby('Country_Region')['ConfirmedCases', 'Fatalities'].sum().reset_index() fig = plt.figure(figsize = (15, 25)) ax = fig.add_subplot(111) ax.barh(country['Country_Region'], country['ConfirmedCases'],label = 'Confirmed Cases') ax.barh(country['Country_Region'], country['Fatalities'],label = 'Fatalities') ax.legend() ax.set_title('Total Confirmed Cases and Fatalities by Country');",No,4,33.0 "#Viewing the top 15 countries with the most confirmed cases ranked = country.sort_values(by = 'ConfirmedCases', ascending = False)[:15] ranked",No,5,9.0 "#Plotting confirmed cases and fatalities for the 15 countries with the most cases countries = ['China', 'Italy', 'US', 'Spain', 'Germany', 'Iran', 'France', 'Korea, South', 'United Kingdom', 'Switzerland', 'Netherlands', 'Belgium', 'Austria', 'Turkey', 
'Canada'] for c in countries: group = train[train['Country_Region'] == c].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() group['ConfirmedCases'].plot(label = 'Confirmed Cases') group['Fatalities'].plot(label = 'Fatalities') plt.legend() plt.title(c) plt.show();",No,4,33.0 "from statsmodels.tsa.seasonal import seasonal_decompose def trends(country, case): group = train[train['Country_Region'] == country].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() decomposition = seasonal_decompose(group[case], freq = 3) trend = decomposition.trend seasonal = decomposition.seasonal residual = decomposition.resid plt.subplot(411) plt.plot(group[case], label= case) plt.legend(loc='best') plt.title('Original') plt.subplot(412) plt.plot(trend, label=case) plt.legend(loc='best') plt.title('Trend') plt.subplot(413) plt.plot(seasonal,label=case) plt.legend(loc='best') plt.title('Seasonality') plt.subplot(414) plt.plot(residual, label=case) plt.legend(loc='best') plt.title('Residual') plt.tight_layout();",No,3,33.0 "from statsmodels.tsa.stattools import adfuller def stationarity_test(country, case): timeseries = train[train['Country_Region'] == country].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() #Perform Dickey-Fuller test: print('Results of Dickey-Fuller Test:') dftest = adfuller(timeseries[case], autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value', '#Lags Used', 'Number of Observations Used']) for key,value in dftest[4].items(): dfoutput['Critical Value (%s)'%key] = value print (dfoutput)",No,5,47.0 "stationarity_test('US', 'ConfirmedCases')",No,5,47.0 "from statsmodels.tsa.arima_model import ARIMA from sklearn.metrics import mean_squared_error",No,5,22.0 "def comb_p_d_q(pVals,dVals,qVals): return [(p,d,q) for p in pVals for d in dVals for q in qVals]",No,5,53.0 "#List of combinations for pdq pdq_results = comb_p_d_q([0,1,2],[0,1,2],[0,1,2]) pdq_results",No,5,53.0 "df.drop_duplicates(subset = ['Date', 'Province_State'], keep = 'last', inplace = True)",No,5,19.0 "from datetime import timedelta TEST_DAYS = 7 TRAIN_LAST = - timedelta(days=TEST_DAYS) TEST_FIRST = sub_df[""Date""].min() TEST_DAYS = (df[""Date""].max() - TEST_FIRST).days + 1 dev_df, test_df = df[df[""Date""] < TEST_FIRST].copy(), df[df[""Date""] >= TEST_FIRST].copy() dev_df.shape, test_df.shape",No,3,14.0 "from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import Pipeline model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)), ('linear', LinearRegression())]) features = [""prev_{}"".format(col) for col in TARGETS] model.fit(dev_df[features], dev_df[TARGETS]) [mean_squared_error(dev_df[TARGETS[i]], model.predict(dev_df[features])[:, i]) for i in range(len(TARGETS))]'",No,3,22.0 "def rmse(y_true, y_pred): return np.sqrt(mean_squared_error(y_true, y_pred)) def evaluate(df): error = 0 for col in TARGETS: error += rmse(df[col].values, df[""pred_{}"".format(col)].values) return np.round(error/len(TARGETS), 5) def predict(test_df, first_day, num_days, val=False): y_pred = np.clip(model.predict(test_df.loc[test_df[""Date""] == first_day][features]), None, 16) for i, col in enumerate(TARGETS): test_df[""pred_{}"".format(col)] = 0 test_df.loc[test_df[""Date""] == first_day, ""pred_{}"".format(col)] = y_pred[:, i] if val: print(first_day, evaluate(test_df[test_df[""Date""] == first_day])) for d in range(1, num_days): y_pred 
= np.clip(model.predict(y_pred), None, 16) date = first_day + timedelta(days=d) for i, col in enumerate(TARGETS): test_df.loc[test_df[""Date""] == date, ""pred_{}"".format(col)] = y_pred[:, i] if val: print(date, evaluate(test_df[test_df[""Date""] == date])) return test_df test_df = predict(test_df, TEST_FIRST, TEST_DAYS, val=True) evaluate(test_df)",No,2,27.0 "for col in TARGETS: test_df[col] = np.expm1(test_df[col]) test_df[""pred_{}"".format(col)] = np.expm1(test_df[""pred_{}"".format(col)])",No,5,8.0 "SUB_FIRST = sub_df[""Date""].min() SUB_DAYS = (sub_df[""Date""].max() - sub_df[""Date""].min()).days + 1 sub_df = dev_df.append(sub_df, sort=False) for col in TARGETS: sub_df[""prev_{}"".format(col)] = sub_df.groupby(loc_group)[col].shift() sub_df = sub_df[sub_df[""Date""] >= SUB_FIRST].copy() sub_df[""ForecastId""] = sub_df[""ForecastId""].astype(np.int16) sub_df = predict(sub_df, SUB_FIRST, SUB_DAYS) for col in TARGETS: sub_df[col] = np.expm1(sub_df[""pred_{}"".format(col)]) sub_df.head()",No,4,8.0 "sub_df.to_csv(""submission.csv"", index=False, columns=[""ForecastId""] + TARGETS)",No,5,25.0 "submission = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") submission",No,5,45.0 "submission['ConfirmedCases'] = sub_df['ConfirmedCases'] submission['Fatalities'] = sub_df['Fatalities'] submission.to_csv('submission.csv', index=False) submission",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import os import matplotlib.pyplot as plt",No,5,22.0 "from sklearn.linear_model import ( ElasticNet, ElasticNetCV, Lasso, LassoCV, LinearRegression, LogisticRegression, Ridge, ) from sklearn.ensemble import ( AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor, VotingClassifier, ) from sklearn.model_selection import ( GridSearchCV, KFold, RandomizedSearchCV, cross_val_score, train_test_split, )",No,5,22.0 "import seaborn as sns from sklearn.base import BaseEstimator from xgboost import XGBClassifier, XGBRegressor import hyperopt as hp from hyperopt import STATUS_OK, Trials, fmin, hp, tpe",No,5,22.0 "#Import Date xtrain = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") xtest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") xsubmission = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/submission.csv"") # view shape of test and train data print(xtrain.shape) print(xtest.shape)",No,4,45.0 "# view head of train data xtrain.head(10)",No,5,41.0 "# view head of test data xtest.head(10)",No,5,41.0 "# view head of submission data xsubmission.head(10)",No,5,41.0 "# date wise value count xtrain['Date'].value_counts()",No,4,54.0 "# create ConfirmedCasesgroup by Province_State statevalue = xtrain.groupby('Province_State').max().ConfirmedCases",No,5,60.0 "# view top state conformed cases in a barplot top_states = statevalue.sort_values(ascending = False).head(10) sns.barplot(x=top_states.index, y=top_states.values) plt.xticks(rotation = 'vertical')",No,5,33.0 "# make data as integer xtrain.ConfirmedCases = xtrain.ConfirmedCases.astype('int64') xtrain.Fatalities = xtrain.Fatalities.astype('int64')",No,5,16.0 "# Date wise confirm case view in an lineplot plt.figure(figsize=(15,6)) sns.lineplot(x=xtrain.Date,y=xtrain.ConfirmedCases,markers=True,style=True) plt.xticks(rotation = 'vertical')",No,5,75.0 "# Date wise Fatalities view in an lineplot plt.figure(figsize=(15,6)) 
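# note added: xtrain holds one row per region per date, and seaborn's lineplot
# aggregates repeated x values with the mean and a confidence band by default,
# so this curve (like the ConfirmedCases lineplot above) shows the per-date average
# across regions rather than a worldwide total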
sns.lineplot(x=xtrain.Date,y=xtrain.Fatalities,markers=True,style=True) plt.xticks(rotation = 'vertical')",No,5,33.0 "# ConfirmedCases and Fatalities column groupby Country Region df_xtrain = xtrain.groupby(['Country_Region'])[['ConfirmedCases', 'Fatalities']].max() print(df_xtrain.sort_values(by=['ConfirmedCases','Fatalities'],ascending=False).head(10))",No,4,60.0 "# view countrywise ConfirmedCases and Fatalities in a plot fig,ax = plt.subplots() fig.set_figheight(10) fig.set_figwidth(40) ax.plot(df_xtrain[:29].index.values,df_xtrain[:29].ConfirmedCases, color=""red"", marker=""o"") ax.set_xlabel(""Countries"",fontsize=24) ax.set_ylabel(""Confirmed Cases"",color=""red"",fontsize=24) ax.tick_params(axis = 'both', which = 'major', labelsize = 24,labelrotation=90) ax2=ax.twinx() ax2.plot(df_xtrain[:29].index.values,df_xtrain[:29].Fatalities,color=""blue"",marker=""o"") ax2.set_ylabel(""Fatalities"",color=""blue"",fontsize=24) ax2.tick_params(axis = 'both', which = 'major', labelsize = 24) plt.show()'",No,5,33.0 "# ConfirmedCases and Fatalities data Analysis Exclude China and view in two Plot confirmed_total_date_noChina = xtrain[xtrain['Country_Region']!='China'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_noChina = xtrain[xtrain['Country_Region']!='China'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_noChina = confirmed_total_date_noChina.join(fatalities_total_date_noChina) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5)) total_date_noChina.plot(ax=ax1) ax1.set_title(""Global confirmed cases excluding China"", size=13) ax1.set_ylabel(""Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_date_noChina.plot(ax=ax2, color='orange') ax2.set_title(""Global deceased cases excluding China"", size=13) ax2.set_ylabel(""Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,5,75.0 "#ConfirmedCases and Fatalities data Analysis and Visualization for China confirmed_total_date_China = xtrain[xtrain['Country_Region']=='China'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_China = xtrain[xtrain['Country_Region']=='China'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_China = confirmed_total_date_China.join(fatalities_total_date_China) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5)) total_date_China.plot(ax=ax1) ax1.set_title(""China confirmed cases"", size=13) ax1.set_ylabel(""Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_date_China.plot(ax=ax2, color='orange') ax2.set_title(""China Fatalities cases"", size=13) ax2.set_ylabel(""Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,3,33.0 "#For Itally confirmed_total_date_Italy = xtrain[xtrain['Country_Region']=='Italy'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Italy = xtrain[xtrain['Country_Region']=='Italy'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Italy = confirmed_total_date_Italy.join(fatalities_total_date_Italy) #For Spain confirmed_total_date_Spain = xtrain[xtrain['Country_Region']=='Spain'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Spain = xtrain[xtrain['Country_Region']=='Spain'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Spain = confirmed_total_date_Spain.join(fatalities_total_date_Spain) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Italy.plot(ax=plt.gca(), title='Italy') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) 
total_date_Spain.plot(ax=plt.gca(), title='Spain') plt.ylabel(""Confirmed infection cases"", size=13)'",No,3,33.0 "#For UK confirmed_total_date_UK = xtrain[xtrain['Country_Region']=='United Kingdom'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_UK = xtrain[xtrain['Country_Region']=='United Kingdom'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_UK = confirmed_total_date_UK.join(fatalities_total_date_UK) #For Singapore confirmed_total_date_Singapore = xtrain[xtrain['Country_Region']=='Singapore'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Singapore = xtrain[xtrain['Country_Region']=='Singapore'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Singapore = confirmed_total_date_Singapore.join(fatalities_total_date_Singapore) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_UK.plot(ax=plt.gca(), title='United Kingdom') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) total_date_Singapore.plot(ax=plt.gca(), title='Singapore') plt.ylabel(""Confirmed infection cases"", size=13)'",No,5,33.0 "#For Australia confirmed_total_date_Australia = xtrain[xtrain['Country_Region']=='Australia'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Australia = xtrain[xtrain['Country_Region']=='Australia'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Australia = confirmed_total_date_Australia.join(fatalities_total_date_Australia) #For Bangladesh confirmed_total_date_Bangladesh = xtrain[xtrain['Country_Region']=='Bangladesh'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Bangladesh = xtrain[xtrain['Country_Region']=='Bangladesh'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Bangladesh = confirmed_total_date_Bangladesh.join(fatalities_total_date_Bangladesh) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Australia.plot(ax=plt.gca(), title='Australia') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) total_date_Bangladesh.plot(ax=plt.gca(), title='Bangladesh') plt.ylabel(""Confirmed infection cases"", size=13)'",No,4,33.0 "pop_italy = 60486683. pop_spain = 46749696. pop_UK = 67784927. pop_singapore = 5837230. total_date_Italy.ConfirmedCases = total_date_Italy.ConfirmedCases/pop_italy*100. total_date_Italy.Fatalities = total_date_Italy.ConfirmedCases/pop_italy*100. total_date_Spain.ConfirmedCases = total_date_Spain.ConfirmedCases/pop_spain*100. total_date_Spain.Fatalities = total_date_Spain.ConfirmedCases/pop_spain*100. total_date_UK.ConfirmedCases = total_date_UK.ConfirmedCases/pop_UK*100. total_date_UK.Fatalities = total_date_UK.ConfirmedCases/pop_UK*100. total_date_Singapore.ConfirmedCases = total_date_Singapore.ConfirmedCases/pop_singapore*100. total_date_Singapore.Fatalities = total_date_Singapore.ConfirmedCases/pop_singapore*100. 
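# note added (hedged aside): the four Fatalities assignments above reuse the ConfirmedCases
# column, which has already been overwritten by its population share, so they hold neither
# raw fatality counts nor a meaningful fraction. Assuming the intent was the fatality share
# of each population, the pattern would be, e.g.
# total_date_Italy.Fatalities = total_date_Italy.Fatalities/pop_italy*100.
# computed before ConfirmedCases is normalised; the subplots below only draw ConfirmedCases,
# so the figures shown are unaffected.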
plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Italy.ConfirmedCases.plot(ax=plt.gca(), title='Italy') plt.ylabel(""Fraction of population infected"") plt.ylim(0, 0.06) plt.subplot(2, 2, 2) total_date_Spain.ConfirmedCases.plot(ax=plt.gca(), title='Spain') plt.ylim(0, 0.06) plt.subplot(2, 2, 3) total_date_UK.ConfirmedCases.plot(ax=plt.gca(), title='United Kingdom') plt.ylabel(""Fraction of population infected"") plt.ylim(0, 0.005) plt.subplot(2, 2, 4) total_date_Singapore.ConfirmedCases.plot(ax=plt.gca(), title='Singapore') plt.ylim(0, 0.005)'",No,5,33.0 "# For Itally confirmed_total_date_Italy = xtrain[(xtrain['Country_Region']=='Italy') & xtrain['ConfirmedCases']!=0].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Italy = xtrain[(xtrain['Country_Region']=='Italy') & xtrain['ConfirmedCases']!=0].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Italy = confirmed_total_date_Italy.join(fatalities_total_date_Italy) # For Spain confirmed_total_date_Spain = xtrain[(xtrain['Country_Region']=='Spain') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Spain = xtrain[(xtrain['Country_Region']=='Spain') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Spain = confirmed_total_date_Spain.join(fatalities_total_date_Spain) # For UK confirmed_total_date_UK = xtrain[(xtrain['Country_Region']=='United Kingdom') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_UK = xtrain[(xtrain['Country_Region']=='United Kingdom') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_UK = confirmed_total_date_UK.join(fatalities_total_date_UK) # For Australia confirmed_total_date_Australia = xtrain[(xtrain['Country_Region']=='Australia') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Australia = xtrain[(xtrain['Country_Region']=='Australia') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Australia = confirmed_total_date_Australia.join(fatalities_total_date_Australia) # For Singapore confirmed_total_date_Singapore = xtrain[(xtrain['Country_Region']=='Singapore') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Singapore = xtrain[(xtrain['Country_Region']=='Singapore') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Singapore = confirmed_total_date_Singapore.join(fatalities_total_date_Singapore) italy = [i for i in total_date_Italy.ConfirmedCases['sum'].values] italy_30 = italy[0:50] spain = [i for i in total_date_Spain.ConfirmedCases['sum'].values] spain_30 = spain[0:50] UK = [i for i in total_date_UK.ConfirmedCases['sum'].values] UK_30 = UK[0:50] singapore = [i for i in total_date_Singapore.ConfirmedCases['sum'].values] singapore_30 = singapore[0:50] ",No,4,12.0 "# Plots plt.figure(figsize=(12,6)) plt.plot(italy_30) plt.plot(spain_30) plt.plot(UK_30) plt.plot(singapore_30) plt.legend([""Italy"", ""Spain"", ""UK"", ""Singapore""], loc='upper left') plt.title(""COVID-19 infections from the first confirmed case"", size=15) plt.xlabel(""Days"", size=13) plt.ylabel(""Infected cases"", size=13) plt.ylim(0, 60000) plt.show()'",No,5,33.0 "# Check if there have any null value xtrain.isnull().sum()",No,5,39.0 "# CHANGE TO PD.DATETIME xtrain.Date = pd.to_datetime(xtrain.Date, infer_datetime_format=True) 
xtest.Date = pd.to_datetime(xtest.Date, infer_datetime_format=True)",No,5,16.0 "# CONCISING THE TRAIN DATASET TO 18TH MARCH 2020. MIN_TEST_DATE = xtest.Date.min() xtrain = xtrain.loc[xtrain.Date < MIN_TEST_DATE, :]",No,4,14.0 "# FILLING MISSING VALUES xtrain.fillna("""", inplace=True) xtest.fillna("""", inplace=True)",No,5,17.0 from statsmodels.tsa.arima_model import ARIMA,No,5,22.0 "# DROPPING COUNTRY REGION AND PROVINCE STATE xtrain.drop(['Country_Region','Province_State'],axis=1,inplace=True) xtest.drop(['Country_Region','Province_State'],axis=1,inplace=True)",No,5,10.0 "# CONVERTING DATE COLUMN TO INTEGER xtrain.loc[:, 'Date'] = xtrain.Date.dt.strftime(""%m%d"") xtest.loc[:, 'Date'] = xtest.Date.dt.strftime(""%m%d"")'",No,5,16.0 "# Region wise Confirmed Cases in LinePlot sns.lineplot(data=xtrain, x=""Date"", y=""ConfirmedCases"", hue=""Region"") plt.show()",No,5,75.0 "# Region wise Fatalities in Line Plot. sns.lineplot(data=xtrain, x=""Date"", y=""Fatalities"", hue=""Region"") plt.show()",No,5,33.0 "# CREATING X AND Y for Train Dataset X1 = xtrain.drop([""ConfirmedCases"", ""Fatalities""], axis=1) X2 = xtrain.drop([""ConfirmedCases"", ""Fatalities""], axis=1) y1 = xtrain[""ConfirmedCases""] y2 = xtrain[""Fatalities""]",No,5,21.0 "# Create TEST 1 AND TEST 2 for Test dataset test_1 = xtest.copy() test_2 = xtest.copy()",No,5,12.0 "for f2 in [""Region""]: me2 = MeanEncoding(f2, C=0.01 * len(X2[f2].unique())) me2.fit(X2, y2) X2 = me2.transform(X2) test_2 = me2.transform(test_2)",No,4,7.0 "for f1 in [""Region""]: me1 = MeanEncoding(f1, C=0.01 * len(X1[f1].unique())) me1.fit(X1, y1) X1 = me1.transform(X1) test_1 = me1.transform(test_1)",No,5,20.0 "# View Test_1 test_1",No,5,53.0 "# View Test_2 test_2",No,5,41.0 "# Load some Basic Library import matplotlib.pyplot as plt from sklearn import model_selection import numpy as np",No,5,22.0 "# FUNCTION FOR COMPARING DIFFERENT REGRESSORS def algorithim_boxplot_comparison( X, y, algo_list=[], random_state=3, scoring=""r2"", n_splits=10 ): results = [] names = [] for algo_name, algo_model in algo_list: kfold = model_selection.KFold( shuffle=True, n_splits=n_splits, random_state=random_state ) cv_results = model_selection.cross_val_score( algo_model, X, y, cv=kfold, scoring=scoring ) results.append(cv_results) names.append(algo_name) msg = ""%s: %s : (%f) %s : (%f) %s : (%f)"" % ( algo_name, ""median"", np.median(cv_results), ""mean"", np.mean(cv_results), ""variance"", cv_results.var(ddof=1), ) print(msg) # boxplot algorithm comparison fig = plt.figure() fig.suptitle(""Algorithm Comparison"") ax = fig.add_subplot(111) plt.boxplot(results) ax.set_xticklabels(names) plt.show()",No,5,84.0 "# REGRESSORS lr = LinearRegression(n_jobs=-1) rfr = RandomForestRegressor(random_state=96, n_jobs=-1) gbr = GradientBoostingRegressor(random_state=96) xgbr = XGBRegressor()",No,5,82.0 "df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) soln = pd.DataFrame({'ForecastId': test_df.ForecastId, 'ConfirmedCases': Y_pred_CC, 'Fatalities': Y_pred_Fat}) df_out = pd.concat([df_out, soln], axis=0) df_out.ForecastId = df_out.ForecastId.astype('int')",No,3,12.0 "# APPENDING THE REGRESSORS IN A LIST models = [] models.append(('lr',lr)) models.append(('rfr',rfr)) models.append(('gbr',gbr)) models.append(('xgbr',xgbr))",No,5,82.0 "df_out.to_csv('submission.csv', index=False) print(""Your submission was successfully saved!"")'",No,5,25.0 \,No,5,6.0 "def model_eval(case): state = ['Italy'] for s in state: train_ts = train[train['Province_State'] == s][:50] 
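# note added: the first 50 rows per state form the ARIMA fitting window and the remaining
# rows are held out for comparison; the loop below keeps the (p,d,q) order from pdq_results
# with the lowest AIC, then refits that order and forecasts over the hold-out horizon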
test_ts = train[train['Province_State'] == s][50:] a = 9999 for pdq in pdq_results: try: model = ARIMA(train_ts[case], order = pdq, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() aicval = model_fit.aic if aicval < a: a = aicval param = pdq except: pass model = ARIMA(train_ts[case], order = param, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() model_fit.plot_predict(start = int(len(train_ts) * 0.3), end = int(len(train_ts) * 1.4)) pred = model_fit.forecast(steps = int(len(test_ts)))[0] ",No,3,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output. '",No,5,88.0 model_eval('ConfirmedCases'),No,5,53.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.optimize import curve_fit import seaborn as sns from datetime import datetime",No,5,22.0 model_eval('Fatalities'),No,5,53.0 "def model(case): state = df['Province_State'].unique() confirmed = [] for s in state: train_ts = df[df['Province_State'] == s][:57] pred_ts = df[df['Province_State'] == s][57:] a = 9999 for pdq in pdq_results: try: model = ARIMA(train_ts[case], order = pdq, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() aicval = model_fit.aic if aicval < a: a = aicval param = pdq except: pass try: model = ARIMA(train_ts[case], order = param, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() pred = model_fit.forecast(steps = int(len(pred_ts)))[0] confirmed = np.append(confirmed, pred.tolist()) except: confirmed = np.append(confirmed, np.repeat(0, 43)) continue test[case] = confirmed",No,3,48.0 "print('Importing training and test data') train_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') # Update dataframe train_df['Province_State'] = train_df['Province_State'].fillna('') train_df['Region'] = train_df['Country_Region'] + train_df['Province_State'] test_df['Province_State'] = test_df['Province_State'].fillna('') test_df['Region'] = test_df['Country_Region'] + test_df['Province_State'] regions = train_df.Region.unique() # Match days in train and test train_min_date = train_df[train_df['Region']=='Sweden']['Date'].min() test_min_date = test_df[test_df['Region']=='Sweden']['Date'].min() dt_train_min = datetime.strptime(train_min_date, '%Y-%m-%d') dt_test_min = datetime.strptime(test_min_date, '%Y-%m-%d') test_start_day = dt_test_min.timetuple().tm_yday - dt_train_min.timetuple().tm_yday print(test_start_day) # Extract dataframes for each country train_data = {} test_data = {} for region in regions: train_data[region] = train_df[train_df['Region']==region] train_data[region]['DayNo'] = np.arange(len(train_df[train_df['Region']==region]['Date'])) test_data[region] = test_df[test_df['Region']==region] test_data[region]['DayNo'] = 
np.arange(test_start_day,test_start_day+len(test_df[test_df['Region']==region]['Date'])) ",No,4,8.0 model('ConfirmedCases'),No,3,48.0 model('Fatalities'),No,3,48.0 "results = test[['ForecastId', 'ConfirmedCases', 'Fatalities']] results.to_csv('submission.csv', index = False)",No,5,25.0 "train_max_date = train_df[train_df['Region']=='Sweden']['Date'].max() dt_train_max = datetime.strptime(train_max_date, '%Y-%m-%d') train_max_day = dt_train_max.timetuple().tm_yday - dt_train_min.timetuple().tm_yday #print(train_max_day) #int(train_data[""Sweden""][train_data[""Sweden""]['DayNo']==69]['ConfirmedCases'].tolist()[0]) #test_data[""Sweden""]['DayNo']'",No,5,77.0 "def sigmoid(x, a, b, c): return a*np.exp(c*(x-b))/(np.exp(c*(x-b))+1)",No,5,84.0 "import numpy as np import pandas as pd",No,4,22.0 "# HYPEROPT def auc_model(params): params = { ""n_estimators"": int(params[""n_estimators""]), ""max_features"": int(params[""max_features""]), ""min_samples_leaf"": int(params[""min_samples_leaf""]), ""min_samples_split"": int(params[""min_samples_split""]), } clf = RandomForestRegressor(**params, random_state=96, n_jobs=-1) return cross_val_score( clf, X1, y1, cv=3, scoring=""neg_mean_squared_log_error"" ).mean() params_space = { ""n_estimators"": hp.quniform(""n_estimators"", 0, 300, 50), ""max_features"": hp.quniform(""max_features"", 1, 3, 1), ""min_samples_leaf"": hp.quniform(""min_samples_leaf"", 1, 50, 1), ""min_samples_split"": hp.quniform(""min_samples_split"", 1, 50, 1), } best = 0 def f(params): global best auc = auc_model(params) if auc > best: print(""New Best"", best, params) return {""loss"": -auc, ""status"": STATUS_OK} trials = Trials() best = fmin(f, params_space, algo=tpe.suggest, max_evals=200, trials=trials) print(""best:\ "", best)'",No,4,1.0 "failed_confirmed = [] failed_fatalities = [] confirmed_popt = {} fatalities_popt = {} for region in regions: x_data = train_data[region]['DayNo'] y_ConfirmedCases_data = train_data[region]['ConfirmedCases'] y_Fatalities_data = train_data[region]['Fatalities'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_ConfirmedCases_data) confirmed_popt[region] = popt except: failed_confirmed.append(region) try: popt, pcov = curve_fit(sigmoid, x_data, y_Fatalities_data) fatalities_popt[region] = popt except: failed_fatalities.append(region) print(""Failed confirmed: "" + str(len(failed_confirmed))) print(""Failed fatalities: "" + str(len(failed_fatalities))) print(""Total: "" + str(len(regions)))'",No,5,84.0 "# Handle failed data confirmed_coeffs = [x for x in confirmed_popt.values()] mean_confirmed_coeffs = np.mean(confirmed_coeffs, axis=0) print(mean_confirmed_coeffs) fatalities_coeffs = [x for x in fatalities_popt.values()] mean_fatalities_coeffs = np.mean(fatalities_coeffs, axis=0) print(mean_fatalities_coeffs) for region in failed_confirmed: x_data = train_data[region]['DayNo'] y_ConfirmedCases_data = train_data[region]['ConfirmedCases'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_ConfirmedCases_data, maxfev=1000, ftol=1e-5) confirmed_popt[region] = popt except: start = 0 for data in y_ConfirmedCases_data: if data > 0: break start = start + 1 popt = mean_confirmed_coeffs popt[1] = start confirmed_popt[region] = popt print(""Failed for C "" + region + "" : "" + str(popt)) for region in failed_fatalities: x_data = train_data[region]['DayNo'] y_Fatalities_data = train_data[region]['Fatalities'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_Fatalities_data, maxfev=1000, ftol=1e-5) 
fatalities_popt[region] = popt except: start = 0 for data in y_Fatalities_data: if data > 0: break start = start + 1 popt = mean_fatalities_coeffs popt[1] = start fatalities_popt[region] = popt print(""Failed F for "" + region + "" : "" + str(popt))'",No,5,53.0 "sub = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') test_regions = test_df.Region.unique() total_count = 0 for region in test_regions: forecastIds = test_data[region]['ForecastId'] x_test_data = test_data[region]['DayNo'] y_conf_test_data = np.nan_to_num(sigmoid(x_test_data, *confirmed_popt[region])).astype(np.int) x_test_data = test_data[region]['DayNo'] y_fatal_test_data = np.nan_to_num(sigmoid(x_test_data, *fatalities_popt[region])).astype(np.int) idx = 0 x_test_data = x_test_data.tolist() for id in forecastIds: day_no = x_test_data[idx] row_index = sub.index[sub['ForecastId'] == id] if day_no > train_max_day: sub.set_value(row_index, 'ConfirmedCases', y_conf_test_data[idx]) sub.set_value(row_index, 'Fatalities', y_fatal_test_data[idx]) else: sub.set_value(row_index, 'ConfirmedCases', int(train_data[region][train_data[region]['DayNo']==day_no]['ConfirmedCases'].tolist()[0])) sub.set_value(row_index, 'Fatalities', int(train_data[region][train_data[region]['DayNo']==day_no]['Fatalities'].tolist()[0])) idx = idx + 1 sub.to_csv('/kaggle/working/submission.csv', index=False) ",No,3,8.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) trainData = pd.read_csv('../input/train.csv') testData = pd.read_csv('../input/test.csv') trainData = trainData.drop('Id', axis=1) testData = testData.drop('Id', axis=1)",No,3,45.0 "trainData['Open Date'] = pd.to_datetime(trainData['Open Date'], format='%m/%d/%Y') testData['Open Date'] = pd.to_datetime(testData['Open Date'], format='%m/%d/%Y') trainData['OpenDays']="""" testData['OpenDays']="""" dateLastTrain = pd.DataFrame({'Date':np.repeat(['01/01/2015'],[len(trainData)]) }) dateLastTrain['Date'] = pd.to_datetime(dateLastTrain['Date'], format='%m/%d/%Y') dateLastTest = pd.DataFrame({'Date':np.repeat(['01/01/2015'],[len(testData)]) }) dateLastTest['Date'] = pd.to_datetime(dateLastTest['Date'], format='%m/%d/%Y') trainData['OpenDays'] = dateLastTrain['Date'] - trainData['Open Date'] testData['OpenDays'] = dateLastTest['Date'] - testData['Open Date'] trainData['OpenDays'] = trainData['OpenDays'].astype('timedelta64[D]').astype(int) testData['OpenDays'] = testData['OpenDays'].astype('timedelta64[D]').astype(int) trainData = trainData.drop('Open Date', axis=1) testData = testData.drop('Open Date', axis=1)'",No,4,16.0 "cityPerc = trainData[[""City Group"", ""revenue""]].groupby(['City Group'],as_index=False).mean() #sns.barplot(x='City Group', y='revenue', data=cityPerc) citygroupDummy = pd.get_dummies(trainData['City Group']) trainData = trainData.join(citygroupDummy) citygroupDummyTest = pd.get_dummies(testData['City Group']) testData = testData.join(citygroupDummyTest) trainData = trainData.drop('City Group', axis=1) testData = testData.drop('City Group', axis=1)'",No,4,10.0 "#Regression on everything from sklearn.ensemble import RandomForestRegressor import matplotlib.pyplot as plt import seaborn as sns sns.set_context(""notebook"", font_scale=1.1) sns.set_style(""ticks"") import numpy 
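# note added: OpenDays and revenue are modelled on a log scale here (presumably to tame
# their right-skew), so the forest below is fit on log targets and its predictions are
# mapped back with numpy.exp later in this same cell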
xTrain = pd.DataFrame({'OpenDays':trainData['OpenDays'].apply(numpy.log), 'Big Cities':trainData['Big Cities'], 'Other':trainData['Other'], 'P2':trainData['P2'], 'P8':trainData['P8'], 'P22':trainData['P22'], 'P24':trainData['P24'], 'P28':trainData['P28'], 'P26':trainData['P26']}) #xTrain = trainData.drop(['revenue'], axis=1) #xTrain['OpenDays'] = xTrain['OpenDays'].apply(numpy.log) yTrain = trainData['revenue'].apply(numpy.log) xTest = pd.DataFrame({'OpenDays':testData['OpenDays'].apply(numpy.log), 'Big Cities':testData['Big Cities'], 'Other':testData['Other'], 'P2':testData['P2'], 'P8':testData['P8'], 'P22':testData['P22'], 'P24':testData['P24'], 'P28':testData['P28'], 'P26':testData['P26']}) from sklearn import linear_model cls = RandomForestRegressor(n_estimators=150) cls.fit(xTrain, yTrain) pred = cls.predict(xTest) pred = numpy.exp(pred) cls.score(xTrain, yTrain)'",No,3,7.0 "pred = cls.predict(xTest) pred = numpy.exp(pred)",No,5,48.0 pred,No,5,53.0 "pred2 = [] for i in range(len(pred)): if pred[i] != float('Inf'): pred2.append(pred[i]) m = sum(pred2) / float(len(pred2)) for i in range(len(pred)): if pred[i] == float('Inf'): print(""haha"") pred[i] = m'",No,5,53.0 "# RANDOMFORESTREGRESSOR FOR CONFIRMEDCASUALTIES rfr1 = RandomForestRegressor( max_features=3, min_samples_leaf=26, min_samples_split=31, n_estimators=200, random_state=96, n_jobs=-1, )",No,5,4.0 "# RANDOMFORESTREGRESSOR FOR FATALITIES rfr2 = RandomForestRegressor( max_features=3, min_samples_leaf=17, min_samples_split=17, n_estimators=100, random_state=96, n_jobs=-1, )",No,5,4.0 "# FITTING RANDOMFORESTREGRESSOR FOR CONFIRMEDCASUALTIES rfr1.fit(X1, y1)",No,5,7.0 "# PREDICTING CONFIRMEDCASUALTIES using RANDOM FOREST REGRESSOR y_n_1 = rfr1.predict(test_1)",No,5,48.0 "# Fit CONFIRMEDCASUALTIES using K neareat neighbour algorithm Classifier from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'braycurtis', p = 1) classifier.fit(X1, y1)",No,4,7.0 "### Predict CONFIRMEDCASUALTIES using K neareat neighbour algorithm Classifier y_pred1 = classifier.predict(X1) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y1, y_pred1) from sklearn.metrics import accuracy_score print( 'Accuracy Score confirmed cases :',accuracy_score(y1,y_pred1)*100)",No,4,27.0 "# FITTING RANDOMFORESTREGRESSOR FOR FATALITIES rfr2.fit(X2, y2)",No,5,7.0 "# PREDICTING FATALITIES y_n_2 = rfr2.predict(test_2)",No,5,48.0 "# ### Fit Fatalities using K neareat neighbour algorithm Classifier from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'braycurtis', p = 1) classifier.fit(X2, y2)",No,5,7.0 "### Predict Fatalities using K neareat neighbour algorithm Classifier y_pred2 = classifier.predict(X2) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y2, y_pred2) from sklearn.metrics import accuracy_score print( 'Accuracy Score confirmed cases :',accuracy_score(y2,y_pred2)*100)",No,3,27.0 "# ADDING CONFIRMEDCASES xsubmission.ConfirmedCases = round(pd.DataFrame(y_n_1))",No,4,8.0 " # ADDING FATALITIES xsubmission.Fatalities = round(pd.DataFrame(y_n_2))",No,4,8.0 "# View submission data xsubmission",No,5,41.0 "# Save Date to submission file xsubmission.to_csv(""submission.csv"", index=False) print(""Submission file create sucessfully"")",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker 
image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "from numpy import loadtxt from xgboost import XGBClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score",No,5,22.0 "Train_data = pd.read_csv(""../input/train.csv"") Test_data = pd.read_csv(""../input/test.csv"") ID = Test_data['Id']'",No,5,45.0 Train_data.head(),No,5,41.0 Test_data.head(),No,5,41.0 "train_levels = Train_data.loc[(Train_data['City'].notnull())] City_counts = train_levels['City'].value_counts().sort_index().to_frame() City_counts",No,5,72.0 "train_levels = Train_data.loc[(Train_data['Type'].notnull())] label_counts = train_levels['Type'].value_counts().sort_index().to_frame() label_counts",No,5,72.0 "# lie del Train_data[""Open Date""] del Train_data[""City""] del Train_data[""City Group""] del Train_data[""Type""] del Train_data[""Id""] del Test_data[""Open Date""] del Test_data[""City""] del Test_data[""City Group""] del Test_data[""Type""] del Test_data[""Id""]'",No,5,10.0 "# 0 Train_data = Train_data.fillna(0) Test_data = Test_data.fillna(0) Test_data.head(10)'",No,4,17.0 "#Regression on everything from sklearn.ensemble import RandomForestRegressor import seaborn as sns import numpy sns.set_context(""notebook"", font_scale=1.11) sns.set_style(""ticks"") yTrain = Train_data['revenue'].apply(numpy.log) Train_data = Train_data.drop([""revenue""],1) xTrain = pd.DataFrame(Train_data) xTest = pd.DataFrame(Test_data) '",No,3,22.0 "from sklearn import linear_model from sklearn.ensemble import RandomForestRegressor cls = RandomForestRegressor(n_estimators=170) cls.fit(xTrain, yTrain) pred = cls.predict(xTest) pred = numpy.exp(pred) closs = cls.score(xTrain, yTrain) closs",No,4,7.0 "pred = cls.predict(xTest) pred = numpy.exp(pred) pred",No,4,27.0 "read_test = { ""Id"":ID, ""Prediction"":pred } read_ = pd.DataFrame(read_test) read_.to_csv(""sample_submission.csv"",index=False) ",No,4,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import random as rnd # visualization import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline from nltk.classify import SklearnClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier from sklearn.svm import SVR,NuSVR,LinearSVR,SVC #support vector regression from sklearn.linear_model import LinearRegression,Ridge,Lasso#Ridge() and Lasso() from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from sklearn.model_selection import StratifiedKFold test = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv"") train = pd.read_csv(""../input/restaurant-revenue-prediction/train.csv"") train.head()",No,4,45.0 print(train.info()),No,5,40.0 "train['P29'] = train['P29'].astype(int) test['P29'] = test['P29'].astype(int) test[""P29""].fillna(test[""P29""].median(), inplace=True) train['P26'] = train['P26'].astype(int) test['P26'] = test['P26'].astype(int) train['P27'] = train['P27'].astype(int) test['P27'] = test['P27'].astype(int) train['P28'] = train['P28'].astype(int) test['P28'] = test['P28'].astype(int) train['P13'] = train['P13'].astype(int) test['P13'] = test['P13'].astype(int) train['P2'] = train['P2'].astype(int) test['P2'] = test['P2'].astype(int) train['P3'] = train['P3'].astype(int) test['P3'] = test['P3'].astype(int) train['P4'] = train['P4'].astype(int) test['P4'] = test['P4'].astype(int)'",No,4,16.0 train.describe(include=['object']),No,5,40.0 train['City'].value_counts(),No,5,72.0 train['City Group'].value_counts(),No,5,72.0 train['Type'].value_counts(),No,5,72.0 "corrmat = train.corr() f, ax = plt.subplots(figsize=(12, 9)) sns.heatmap(corrmat, vmax=.8, square=True);",No,5,80.0 "train[""City Group""] = train[""City Group""].map({""Big Cities"": 0, ""Other"":1}) test[""City Group""] = test[""City Group""].map({""Big Cities"": 0, ""Other"":1}) train[""Type""] = train[""Type""].map({""FC"": 0, ""IL"":1,""DT"":2}) test[""Type""] = test[""Type""].map({""FC"": 0, ""IL"":1,""DT"":2}) # Is city important or not #How can we get groups of revenue and plot it against city groups and types to compare",No,5,20.0 "test[""Type""].fillna(test[""Type""].median(), inplace=True) train[""revenue""].fillna(train[""revenue""].median(), inplace=True) train['revenue'] = train['revenue'].astype(int) import numpy Y_train=train[""revenue""].apply(numpy.log) '",No,4,17.0 "X_train = train.drop(['City','Open Date','revenue','Id','City Group'], axis=1) #X_test = test.drop(""Id"",axis=1).copy() X_test = test.drop(['City','Open Date','Id','City Group'], axis=1) X_train.head() #X_test.head()'",No,4,10.0 "#from sklearn.impute import SimpleImputer #my_imputer = SimpleImputer() #imputed_X_train = my_imputer.fit_transform(X_train) #imputed_X_test = my_imputer.transform(X_test)",No,3,12.0 "test.head() test=test.drop(['City','Open Date','City Group'], axis=1)",No,5,10.0 submission,No,5,41.0 "submission.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import norm, skew # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,22.0 "print('Start model training') start_time = time.time() history_cc = model_cc.fit(X_train_cc, y_train_cc, epochs = 100,validation_data = (X_val_cc, y_val_cc), verbose = 2, callbacks=[early_stop]) model_cc.save(""model_cc.h5"") print('Time spent for model training is {} minutes'.format(round((time.time()-start_time)/60,1)))'",No,4,7.0 "# Plot training & validation loss values plt.figure(figsize=(8,5)) plt.plot(history_cc.history['loss']) plt.plot(history_cc.history['val_loss']) plt.title('CC Model loss') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='upper left') plt.show()",No,5,35.0 "print('Start model training') start_time = time.time() history_ft = model_ft.fit(X_train_ft, y_train_ft, epochs = 100,validation_data = (X_val_ft, y_val_ft), verbose = 2, callbacks=[early_stop]) model_ft.save(""model_ft.h5"") print('Time spent for model training is {} minutes'.format(round((time.time()-start_time)/60,1)))'",No,4,7.0 "# Plot training & validation loss values plt.figure(figsize=(8,5)) plt.plot(history_ft.history['loss']) plt.plot(history_ft.history['val_loss']) plt.title('FT Model loss') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='upper left') plt.show()",No,5,35.0 "# Validate if output makes sense yhat_val_cc = model_cc.predict(X_val_cc) print(yhat_val_cc)",No,5,48.0 print(val_y_cc),No,5,53.0 "# Validate if output makes sense yhat_val_ft = model_cc.predict(X_val_ft) print(yhat_val_ft)",No,5,48.0 print(val_y_ft),No,5,53.0 "#TODO: takes 14m ish, consider multi-processing, multi-cores, run in GPU #TODO: create data_generate func start_time = time.time() test['Country_Region'] = test['Country_Region'].astype(str) test['Province_State'] = test['Province_State'].astype(str) country = test['Country_Region'].drop_duplicates() adj_input_cols = [e for e in input_cols if e not in ('province_encoder', 'country_encoder', 'date_int')] # fill data for overlapped days test_df = test.copy().join(pd.DataFrame(columns = adj_input_cols + output_cols)) test_df.rename(columns={'Date': 'date', 'Country_Region': 'country', 'Province_State': 'province'}, inplace=True) lags = np.arange(1,8,1) # lag of 1 to 7 test_overlap_mask = (test_df['date'] <= train_df['date'].max()) train_overlap_mask = (train_df['date'] >= test_df['date'].min()) test_df.loc[test_overlap_mask, input_cols + output_cols] = train_df.loc[train_overlap_mask, input_cols + output_cols].values # predict data for forward days pred_dt_range = pd.date_range(start = train_df['date'].max() + pd.Timedelta(days=1), end = test_df['date'].max(), freq = '1D') # test_df['date'].max() with tqdm(total = len(pred_dt_range)) as pbar: for d in pred_dt_range: for i in country: province = test_df[test_df['country'] == i]['province'].drop_duplicates() for j in province: mask = (test_df['date'] == d) & (test_df['country'] == i) & (test_df['province'] == j) # update input features for the predicted day for lag in lags: mask_org = (test_df['date'] == (d - pd.Timedelta(days=lag))) & (test_df['country'] == i) & (test_df['province'] == j) try: test_df.loc[mask, 'cc_cases_' + str(lag)] = test_df.loc[mask_org, 'cc_cases'].values except: test_df.loc[mask, 'cc_cases_' + str(lag)] = 0 try: test_df.loc[mask, 
'ft_cases_' + str(lag)] = test_df.loc[mask_org, 'ft_cases'].values except: test_df.loc[mask, 'ft_cases_' + str(lag)] = 0 test_X = test_df.loc[mask, input_cols] # predict for confirmed cases test_X_cc = test_X[cc_input] X_test_cc= test_X_cc # X_test_cc = X_scaler_cc.transform(test_X_cc) # input/output 2D array-like # X_test_cc = X_test_cc.reshape(X_test_cc.shape[0], 1, X_test_cc.shape[1]) X_test_cc = X_test_cc.to_numpy().reshape(X_test_cc.shape[0], 1, X_test_cc.shape[1]) next_cc = model_cc.predict(X_test_cc) # next_cc_scaled = y_scaler_cc.inverse_transform(next_cc) next_cc_scaled = next_cc # predict for fatal cases test_X_ft = test_X[ft_input] X_test_ft = test_X_ft # X_test_ft = X_scaler_ft.transform(test_X_ft) # input/output 2D array-like # X_test_ft = X_test_ft.reshape(X_test_ft.shape[0], 1, X_test_ft.shape[1]) X_test_ft = X_test_ft.to_numpy().reshape(X_test_ft.shape[0], 1, X_test_ft.shape[1]) next_ft = model_ft.predict(X_test_ft) # use the fatalities model here, not model_cc # next_ft_scaled = y_scaler_ft.inverse_transform(next_ft) next_ft_scaled = next_ft # print(d, ' - ', i, ' - ', j, ' - Predicted Confirmed Cases are ', next_cc_scaled, ' - Predicted Fatal Cases are ', next_ft_scaled) # update yhat for next day test_df.loc[mask, 'cc_cases'] = next_cc_scaled test_df.loc[mask, 'ft_cases'] = next_ft_scaled pbar.update(1) print('Time spent for building features is {} minutes'.format(round((time.time()-start_time)/60,1)))",No,2,8.0 "submission = pd.DataFrame() submission['ForecastId'] = test_df['ForecastId'] submission['ConfirmedCases'] = test_df['cc_cases'] submission['Fatalities'] = test_df['ft_cases']",No,5,12.0 "submission.to_csv(""submission.csv"",index=False)",No,5,25.0 submission[:20],No,5,41.0 "import warnings warnings.filterwarnings('ignore')",No,5,23.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 "df_train = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') df_test = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') df_train.shape, df_test.shape",No,4,45.0 train_revenue = df_train.pop('revenue'),No,5,10.0 df_train.isnull().sum().sort_values(ascending=False),No,5,39.0 df_test.isnull().sum().sort_values(ascending=False),No,5,39.0 df_train['Open Date'] = df_train['Open Date'].str.split('/').apply(lambda x : x[2]),No,5,8.0 df_test['Open Date'] = df_test['Open Date'].str.split('/').apply(lambda x : x[2]),No,5,8.0 "df_train.shape, df_test.shape",No,5,58.0 "df_train.drop(columns=[""Id""],inplace=True) df_test_index = df_test.pop('Id')",No,5,10.0 "from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(handle_unknown='ignore')",No,3,20.0 "df_train_ohe = ohe.fit_transform(df_train) df_train_ohe = df_train_ohe.todense()",No,5,20.0 "df_test_ohe = ohe.transform(df_test) df_test_ohe = df_test_ohe.toarray()",No,5,20.0 df_train_ohe.shape,No,5,58.0 df_test_ohe.shape,No,5,58.0 "from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "param = { ""n_estimators"": range(10,20,2), ""learning_rate"": [0.0001,0.001,0.01,0.1], ""loss"" : ['ls', 'lad', 'huber', 'quantile'], ""min_samples_split"": range(10,15,2), ""min_samples_leaf"": range(10,15,2), ""max_depth"": range(10,20,2), ""alpha"": [0,0.1,0.3,0.5,0.7,0.9] }",No,5,5.0 "gbr = GradientBoostingRegressor(alpha=0.1, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=10,
min_samples_split=10, min_weight_fraction_leaf=0.0, n_estimators=10, n_iter_no_change=None, presort='deprecated', random_state=17, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)",No,5,4.0 "gbr.fit(df_train_ohe,train_revenue)",No,5,7.0 "train_revenue_predict = gbr.predict(df_train_ohe) test_revenue = gbr.predict(df_test_ohe)",No,4,27.0 "from sklearn.metrics import mean_squared_error mse = mean_squared_error(train_revenue_predict,train_revenue) rmse = np.sqrt(mse) print(rmse)",No,5,28.0 "df_submit = pd.DataFrame({'Id': df_test_index, 'Prediction': test_revenue})",No,5,12.0 "df_submit.to_csv('submit.csv',index=False) df_submit.head()",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) %matplotlib inline import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn import datasets from sklearn.metrics import mean_squared_error from sklearn import ensemble # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "data = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') data.head()",No,4,45.0 data.isnull().sum(),No,5,39.0 y_train = data.pop('revenue'),No,5,10.0 "data = data.drop(data.columns[[0, 1,2,3,4]], axis=1) x_train=data[:] data.head()",No,4,10.0 "from sklearn.ensemble import GradientBoostingRegressor model = GradientBoostingRegressor() model.fit(x_train, y_train)",No,5,7.0 "from sklearn.ensemble import GradientBoostingRegressor learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01] #learing_rates=[] train_results = [] test_results = [] #beta=range(.05,1.05,.05) for eta in learning_rates: #learning_rates.append(eta) model = GradientBoostingRegressor(learning_rate=eta) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(learning_rates, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Learning Rate') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor n_estimators = [ 1, 2, 4, 8, 16, 32, 64, 100, 200, 500, 1000, 2000] train_results = [] test_results = [] for estimator in n_estimators: model = GradientBoostingRegressor(n_estimators=estimator) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have 
a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(n_estimators, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Number of Estimators') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor #min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True) min_samples_splits=[2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for min_samples_split in min_samples_splits: model = GradientBoostingRegressor(min_samples_split=min_samples_split) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(min_samples_splits, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Min Samples Split') plt.show()'",No,5,84.0 "from sklearn.ensemble import GradientBoostingRegressor max_depths = [1,2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for max_depth in max_depths: model = GradientBoostingRegressor(max_depth=max_depth) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(max_depths, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. of Depth') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor #min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True) min_samples_leafs = [1,2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for min_samples_leaf in min_samples_leafs: model = GradientBoostingRegressor(min_samples_leaf=min_samples_leaf) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(min_samples_leafs, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. 
of Samples Leaf') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor max_features = list(range(1,data.shape[1])) train_results = [] test_results = [] for max_feature in max_features: model = GradientBoostingRegressor(max_features=max_feature) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(max_features, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. of Feature') plt.show()'",No,5,35.0 "data1 = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') data1.head()",No,4,45.0 "data1=data1.drop(data1.columns[[0, 1,2,3,4]], axis=1)",No,5,10.0 data1.head(),No,5,41.0 sample =pd.read_csv('/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv'),No,5,45.0 "params = {'n_estimators': 225, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.25, 'loss': 'ls','max_features':10, 'min_samples_leaf':2} model = ensemble.GradientBoostingRegressor(**params) model.fit(x_train, y_train) sample[""Prediction""] = model.predict(data1)'",No,3,4.0 "sample.to_csv('submission.csv', index = False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 " import pandas as pd import numpy as np ",No,5,22.0 "import pandas as pd df_train = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip') df_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip')",No,4,45.0 "train_corr = df_train.corr() print(train_corr) train_corr.to_csv(""corr.csv"", index = False)",No,4,25.0 "import datetime from sklearn.preprocessing import LabelEncoder revenue = df_train[""revenue""]",No,4,22.0 "del df_train[""revenue""]",No,5,10.0 "df_whole = pd.concat([df_train, df_test], axis=0)",No,5,11.0 "df_whole[""Open Date""] = pd.to_datetime(df_whole[""Open Date""]) df_whole[""Year""] = df_whole[""Open Date""].apply(lambda x:x.year) df_whole[""Month""] = df_whole[""Open Date""].apply(lambda x:x.month) df_whole[""Day""] = df_whole[""Open Date""].apply(lambda x:x.day)",No,4,8.0 "le = LabelEncoder() df_whole[""City""] = le.fit_transform(df_whole[""City""])",No,5,20.0 "df_whole[""City Group""] = df_whole[""City Group""].map({""Other"":0, ""Big Cities"":1})",No,5,20.0 "df_whole[""Type""] = df_whole[""Type""].map({""FC"":0, ""IL"":1, ""DT"":2, ""MB"":3})",No,5,20.0 "df_train = df_whole.iloc[:df_train.shape[0]] df_test = df_whole.iloc[df_train.shape[0]:]",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor # df_train_columns = [col for col in df_train.columns if col not in [""Id"", ""Open Date""]] #RandomForest rf = RandomForestRegressor( n_estimators=200, max_depth=5, max_features=0.5, random_state=449, n_jobs=-1 ) rf.fit(df_train[df_train_columns], revenue)'",No,2,7.0 prediction = rf.predict(df_test[df_train_columns]),No,5,48.0 "submission = pd.DataFrame({""Id"":df_test.Id, ""Prediction"":prediction}) submission.to_csv(""TFI_submission.csv"", index=False)",No,4,25.0 "df = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip', index_col='Id') df",No,5,45.0 "y = df.revenue X = df.drop(columns=['revenue'], axis=1)",No,5,21.0 "y ",No,5,41.0 X,No,5,41.0 "for col in X.columns: print(col, df[col].dtype)",No,5,70.0 "y.isna().sum() ",No,5,39.0 "X_num = X.select_dtypes(exclude=['object']) X_num",No,5,84.0 "df.shape ",No,5,58.0 "X_num.shape ",No,5,58.0 "for col in X_num.columns: if X_num[col].isna().sum() > 0: print(col, X_num[col].isna().sum() / len(X_num) )",No,5,39.0 "from sklearn.impute import SimpleImputer imputer = SimpleImputer() X_num_imputed = pd.DataFrame(imputer.fit_transform(X_num))",No,4,17.0 "parameters = { 'n_estimators': list(range(100, 1001, 100)), 'max_leaf_nodes': list(range(2, 70, 5)), 'max_depth': list(range(6, 70, 5)) } parameters",No,5,5.0 "my_randome_state=1486 ",No,5,77.0 "from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestRegressor gsearch = GridSearchCV(estimator=RandomForestRegressor(random_state=my_randome_state), param_grid = parameters, scoring='neg_mean_absolute_error', n_jobs=4,cv=5, verbose=7)",No,4,5.0 "gsearch.fit(X_num_imputed, y) ",No,5,6.0 "best_n_estimators = 
gsearch.best_params_.get('n_estimators') best_n_estimators",No,5,2.0 "best_max_leaf_nodes = gsearch.best_params_.get('max_leaf_nodes') best_max_leaf_nodes",No,5,2.0 "best_max_depth = gsearch.best_params_.get('max_depth') best_max_depth",No,5,2.0 "final_model = RandomForestRegressor(n_estimators=best_n_estimators, random_state=my_randome_state, max_leaf_nodes=best_max_leaf_nodes, max_depth=best_max_depth)",No,5,4.0 "final_model.fit(X_num_imputed, y) ",No,5,7.0 "X_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip', index_col='Id') X_test ",No,5,45.0 "X_test_num = X_test.select_dtypes(exclude=['object']) ",No,5,14.0 "for col in X_test_num.columns: if X_test_num[col].isna().sum() > 0: print(col, X_test_num[col].isna().sum() / len(X_test_num) )",No,5,39.0 "X_test_num_imputed = pd.DataFrame(imputer.transform(X_test_num)) X_test_num_imputed.columns = X_test_num.columns X_test_num_imputed",No,5,61.0 "preds_test = final_model.predict(X_test_num_imputed) ",No,5,48.0 "len(preds_test) ",No,5,40.0 "# Save test predictions to file output = pd.DataFrame({'Id': X_test.index, 'Prediction': preds_test}) output",No,5,55.0 "output.to_csv('submission.csv', index=False) print('done!')",No,5,25.0 "data_train = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip',index_col='Id', parse_dates=[""Open Date""]) data_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip',index_col='Id', parse_dates=[""Open Date""]) data_train'",No,4,45.0 data_train.describe(),No,5,40.0 data_train.isnull().sum(),No,5,39.0 "for i in data_train.columns: print(i ,': ',len(data_train[i].unique()))",No,5,54.0 "columnsForDrop = ['Open Date'] data_train.drop(columns=columnsForDrop, inplace=True) ################################ data_test.drop(columns=columnsForDrop, inplace=True) data_train",No,4,10.0 "s = (data_train.dtypes == 'object') object_cols = list(s[s].index) print(""Categorical variables:"") print(object_cols)'",No,4,37.0 "from sklearn.preprocessing import LabelEncoder # Make copy to avoid changing original data label_X_train = data_train.copy() label_X_test = data_test.copy() # Apply label encoder to each column with categorical data label_encoder = LabelEncoder() for col in object_cols: label_encoder.fit(pd.concat([data_train[col], data_test[col]], axis=0, sort=False)) label_X_train[col] = label_encoder.transform(data_train[col]) label_X_test[col] = label_encoder.transform(data_test[col])",No,5,20.0 "data_train = label_X_train data_test = label_X_test data_train",No,4,41.0 "y = data_train.revenue ############################################ X = data_train.drop(columns=['revenue'])",No,5,21.0 "from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score, classification_report, f1_score from sklearn.neighbors import KNeighborsClassifier",No,5,22.0 "from xgboost import XGBRegressor from sklearn.metrics import mean_absolute_error ,explained_variance_score, mean_squared_error #########################################################################3 from sklearn.ensemble import RandomForestRegressor parameters = {'max_depth': list(range(6, 30, 10)), 'max_leaf_nodes': list(range(50, 500, 100)), 'n_estimators': list(range(50, 1001, 150))} parameters1 = {'max_depth': [6], 'max_leaf_nodes': [250], 'n_estimators': [100]} from sklearn.model_selection import GridSearchCV gsearch = GridSearchCV(estimator=RandomForestRegressor(), param_grid = parameters, scoring='neg_mean_squared_error', n_jobs=4,cv=5,verbose=7) gsearch.fit(X, y)",No,4,6.0 
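A minimal sketch (not part of the original notebook) of how the fitted gsearch object above can be inspected before reading off the individual best parameters in the next cells. It assumes gsearch is the GridSearchCV instance just fitted on X and y with scoring='neg_mean_squared_error', and it converts the best score back into an RMSE, which is easier to compare against the competition metric.

import numpy as np
import pandas as pd

# Assumes `gsearch` is the fitted GridSearchCV object from the cell above.
cv_table = pd.DataFrame(gsearch.cv_results_)
cv_table = cv_table.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']]
print(cv_table.head())
# scoring was 'neg_mean_squared_error', so negate before taking the square root
print('Best CV RMSE:', np.sqrt(-gsearch.best_score_))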
"print(gsearch.best_params_.get('n_estimators')) print(gsearch.best_params_.get('max_leaf_nodes')) print(gsearch.best_params_.get('max_depth'))",No,5,2.0 "print(data_train.shape) print(data_test.shape) print(X.shape)",No,5,58.0 "final_model = RandomForestRegressor( max_depth = gsearch.best_params_.get('max_depth'), max_leaf_nodes = gsearch.best_params_.get('max_leaf_nodes'), n_estimators = gsearch.best_params_.get('n_estimators'),random_state=1, n_jobs=4) final_model.fit(X, y)",No,4,6.0 "preds = final_model.predict(data_test) print(preds.shape) print(data_test.shape)",No,4,48.0 "testData = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv.zip"") submission = pd.DataFrame({ ""Id"": testData[""Id""], ""Prediction"": preds }) submission.to_csv('RandomForestSimple.csv',header=True, index=False) print('Done')'",No,4,25.0 "df_train=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"",index_col='Id') X_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip',index_col='Id') df_train.shape,X_test.shape '",No,4,45.0 "train_with_missing = [col for col in df_train.columns if df_train[col].isnull().any()] test_with_missing = [col for col in X_test.columns if X_test[col].isnull().any()] train_with_missing,test_with_missing",No,3,17.0 "y_train = df_train.revenue X_train = df_train.drop(columns=['revenue'], axis=1)",No,5,21.0 "X_train.shape,X_test.shape",No,5,58.0 "b""import matplotlib.pyplot as plt\n\nd_names = ('train.csv.zip', 'test.csv.zip')\ny_pos = range(len(d_names))\n \nplt.bar(\n y_pos, \n (X_train.shape[0], X_test.shape[0]), \n align='center', \n alpha=0.8\n)\nplt.xticks(y_pos, d_names)\nplt.ylabel('Number of rows') \nplt.title(' Wow!')\nplt.show()""",No,4,33.0 "bad_label_cols = list(set(X_train.columns)-set(X_test.columns)) bad_label_cols",No,4,37.0 X_train['City'].value_counts(),No,5,72.0 X_train.Type.value_counts(),No,5,72.0 "X_train['year'] = pd.DatetimeIndex(X_train['Open Date']).year X_train.drop(columns=['Open Date','City'],inplace=True)",No,4,10.0 "X_test['year'] = pd.DatetimeIndex(X_test['Open Date']).year X_test.drop(columns=['Open Date','City'],inplace=True)",No,4,10.0 "from sklearn.preprocessing import OneHotEncoder # Apply one-hot encoder to each column with categorical data OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False) OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols])) OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols])) # One-hot encoding removed index; put it back OH_cols_train.index = X_train.index OH_cols_test.index = X_test.index # Remove categorical columns (will replace with one-hot encoding) num_X_train = X_train.drop(object_cols, axis=1) num_X_test = X_test.drop(object_cols, axis=1) # Add one-hot encoded columns to numerical features OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1) OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)",Yes,3,20.0 OH_X_train.head(),No,5,41.0 OH_X_test.head(),No,5,41.0 "train_stats=OH_X_train.describe().transpose() train_stats ",No,5,40.0 "def norm(x): return (x - train_stats['mean']) / train_stats['std'] normed_train_data = norm(OH_X_train) normed_test_data = norm(OH_X_test)",No,5,18.0 normed_test_data.head(),No,5,41.0 "parameters = { 'n_estimators': list(range(10, 300, 20)), 'learning_rate': [l / 100 for l in range(5, 100, 20)], 'max_depth': list(range(1, 20,3)), 'gamma':[2,3], 'eta':[0.8,0.9], 'reg_alpha':[0.5,0.6,0.7,0.8], 'reg_lambda':[0.5,0.6,0.7,0.8] } parameters",No,5,5.0 my_randome_state=70,No,5,77.0 
"from sklearn.model_selection import GridSearchCV from xgboost import XGBRegressor gsearch = GridSearchCV(estimator=XGBRegressor(random_state=my_randome_state), param_grid = parameters, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5, verbose=7) gsearch.fit(normed_train_data, y_train)",No,4,6.0 "best_n_estimators = gsearch.best_params_.get('n_estimators') best_learning_rate = gsearch.best_params_.get('learning_rate') best_max_depth = gsearch.best_params_.get('max_depth') best_max_gamma = gsearch.best_params_.get('gamma') best_max_eta = gsearch.best_params_.get('eta') best_max_reg_alpha = gsearch.best_params_.get('reg_alpha') best_max_reg_lambda = gsearch.best_params_.get('reg_lambda') best_max_depth,best_n_estimators,best_learning_rate,best_max_gamma,best_max_eta,best_max_reg_alpha,best_max_reg_lambda",No,5,2.0 "final_model = XGBRegressor(n_estimators=best_n_estimators, random_state=my_randome_state, learning_rate=best_learning_rate, max_depth=best_max_depth, gamma=best_max_gamma, eta=best_max_eta, reg_alpha=best_max_reg_alpha, reg_lambda=best_max_reg_lambda) final_model.fit(normed_train_data, y_train)",No,4,7.0 preds_test = final_model.predict(normed_test_data),No,5,48.0 "# zip import zipfile with zipfile.ZipFile(""/kaggle/input/restaurant-revenue-prediction/test.csv.zip"") as zf: zf.extractall() with zipfile.ZipFile(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"") as zf: zf.extractall() '",No,3,44.0 "# df_train = pd.read_csv(""train.csv"") df_test = pd.read_csv(""test.csv"") df_submission = pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv"")'",No,5,45.0 df_test,No,5,41.0 "# corrmat = df_train.corr() # corrmat'",No,5,40.0 "# y_train = df_train[""revenue""] del df_train[""revenue""]'",No,4,21.0 "# df_all = pd.concat([df_train, df_test], axis=0) # axis=0 : '",No,5,11.0 "# OpenDate # timestamp df_all[""Open Date""] = pd.to_datetime(df_all[""Open Date""]) df_all[""Year""] = df_all[""Open Date""].dt.year df_all[""Month""] = df_all[""Open Date""].dt.month df_all[""Day""] = df_all[""Open Date""].dt.day'",No,4,8.0 "dftrain['Open Date'] = dftrain['Open Date'].apply(lambda x: pd.to_datetime(x)) dftest['Open Date'] = dftest['Open Date'].apply(lambda x: pd.to_datetime(x)) def calc_days(dtime): now_time = pd.to_datetime('2015-01-01') result = now_time - dtime return int(result.total_seconds()/3600//24) dftrain['days_since_open'] = dftrain['Open Date'].apply(lambda x: calc_days(x)) dftest['days_since_open'] = dftest['Open Date'].apply(lambda x: calc_days(x))",No,3,8.0 dftrain['revenue'] = dftrain['revenue'].astype(int),No,5,16.0 " for col in dftrain.columns: if (dftrain[col].dtype == int) | (dftrain[col].dtype == float): if col not in ['P2','P3', 'P7','revenue']: dftrain[col] = dftrain[col].map(lambda x:np.log1p(x)) print (col) dftest[col] = dftest[col].map(lambda x: np.log1p(x)) ",No,5,20.0 pd.get_dummies(dftrain).columns,No,5,71.0 "#TFI (tab food investments) has provided a dataset with 137 restaurants in the training set, and a test set of 100000 restaurants.. 
data = pd.read_csv('../input/restaurant-revenue-prediction/train.csv') test_data = pd.read_csv('../input/restaurant-revenue-prediction/test.csv')",No,5,45.0 data.describe(),No,5,40.0 data.head(),No,5,41.0 pd.get_dummies(dftest).columns[50:],No,5,53.0 "#log transforming dftrain revenue dftrain['revenue']= dftrain.revenue.apply(lambda x: np.log1p(x))",No,5,8.0 "dftrain1 = pd.get_dummies(dftrain, drop_first = True) dftest1 = pd.get_dummies(dftest,drop_first = True)",No,5,20.0 "cat_cols = ['City', 'Open Date', 'City']",No,3,10.0 final_pred = model.predict(dftest1),No,2,10.0 "sampledf = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv') ",No,2,7.0 sampledf.head(),No,5,41.0 sub = pd.DataFrame(),No,5,12.0 "sub['Id'] = sampledf.Id sub['Prediction'] = np.expm1(final_pred)",No,5,55.0 "sub.to_csv('submission.csv', index = False)",No,5,25.0 "train = pd.read_csv(""../input/restaurant-revenue-prediction/train.csv.zip"")",No,5,45.0 train.shape,No,5,58.0 "train[""City""].value_counts()",No,5,72.0 "train[""City""] = (train[""City""]==""stanbul"").astype(np.int)'",No,5,16.0 "train[""City Group""].value_counts()",No,5,72.0 "train[""City Group""] = l.fit_transform(train[""City Group""])",No,3,16.0 "train[""Type""].value_counts()",No,5,72.0 "train[""Type""] = l.fit_transform(train[""Type""])",No,5,20.0 "train[""year""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""year""][i]=a[2] train[""month""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""month""][i]=a[0] train[""day_No""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""day_No""][i]=a[1] ",No,5,8.0 "test = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv.zip"")",No,5,45.0 test.shape,No,5,58.0 "test[""City""] = (test[""City""]==""stanbul"").astype(np.int) test[""Type""] = l.fit_transform(test[""Type""]) test[""City Group""] = l.fit_transform(test[""City Group""])'",No,4,16.0 "test[""year""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""year""][i]=a[2] test[""month""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""month""][i]=a[0] test[""day_No""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""day_No""][i]=a[0] ",No,5,8.0 "x_train = train.drop(columns=[""Id"",""revenue"",""Open Date""],axis=1) y_train = train[""revenue""] x_test = test.drop(columns=[""Id"",""Open Date""],axis=1) x_train.shape,x_test.shape,y_train.shape",No,5,21.0 "xtrain = s.fit_transform(x_train) x_train = pd.DataFrame(x_train,columns=x_train.columns) xtest = s.fit_transform(x_test) x_test = pd.DataFrame(x_test,columns=x_test.columns)",No,4,18.0 "x_train.shape,x_test.shape,y_train.shape",No,5,58.0 "from xgboost import XGBRegressor model = XGBRegressor() model.fit(x_train,y_train) y_pred = model.predict(x_test)",No,1,48.0 "s = pd.read_csv(""../input/restaurant-revenue-prediction/sampleSubmission.csv"")",No,5,45.0 s.head(),No,5,41.0 f = pd.DataFrame(f),No,5,12.0 "f.to_csv(""submission.csv"",index=False)",No,5,25.0 f.head(),No,5,41.0 "df=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"") df.head()",No,3,45.0 df.describe(),No,5,40.0 "import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 df.columns,No,5,71.0 "df=df.drop('Id',axis=1)",No,5,10.0 "df.columns ",No,5,71.0 "df['Open Date']=pd.to_datetime(df['Open Date']) df",No,5,16.0 "df['month']=[x.month for x in 
df['Open Date']] df['year']=[x.year for x in df['Open Date']]",No,5,8.0 "df ",No,5,41.0 df.groupby('month')['revenue'].mean(),No,5,60.0 "sns.barplot('month','revenue',data=df)",No,5,33.0 "df=df.drop('Open Date',axis=1) df['Type'].value_counts() ty={'FC':0,'IL':1,'DT':2} df['Type']=df['Type'].map(ty)",No,3,8.0 df['City Group'].value_counts(),No,3,16.0 "cg={'Big Cities':0,'Other':1} df['City Group']=df['City Group'].map(cg)",No,5,20.0 df['City'].value_counts(),No,5,72.0 "x=0 c={} for i in df['City'].unique(): c.update({i:x}) x=x+1 ",No,5,53.0 c,No,5,53.0 "from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor",No,5,22.0 "from sklearn.metrics import mean_absolute_error ,mean_squared_error,r2_score",No,5,22.0 "dr=DecisionTreeRegressor() dr=dr.fit(X_train,Y_train) pred=dr.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the DecisionTreeRegressor is: ""+str(s)) print(""The MsE with the DecisionTreeRegressor is: ""+str(s1)) print(""The R2_Score with the DecisionTreeRegressor is: ""+str(s2)) ",No,4,49.0 "r=RandomForestRegressor() r=r.fit(X_train,Y_train) pred=r.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the RandomForestRegressor is: ""+str(s)) print(""The MsE with the RandomForestRegressor is: ""+str(s1)) print(""The R2_Score with the RandomForestRegressor is: ""+str(s2))",No,4,49.0 "x=XGBRegressor() x=dr.fit(X_train,Y_train) pred=x.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the XGBRegressor is: ""+str(s)) print(""The MsE with the XGBRegressor is: ""+str(s1)) print(""The R2_Score with the XGBRegressor is: ""+str(s2))",Yes,3,7.0 "df_t=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/test.csv.zip"") df_t.head() i_d=df_t['Id'] df_t=df_t.drop('Id',axis=1)'",Yes,3,45.0 df_t['Open Date']=pd.to_datetime(df_t['Open Date']),Yes,5,16.0 "df_t['month']=[x.month for x in df_t['Open Date']] df_t['year']=[x.year for x in df_t['Open Date']]",Yes,5,8.0 "df_t=df_t.drop('Open Date',axis=1) df_t['Type'].value_counts() ty={'FC':0,'IL':1,'DT':2} df_t['Type']=df_t['Type'].map(ty) cg={'Big Cities':0,'Other':1} df_t['City Group']=df_t['City Group'].map(cg) x=0 c={} for i in df_t['City'].unique(): c.update({i:x}) x=x+1 df_t['City']=df_t['City'].map(c)",Yes,5,8.0 df_t.head(),No,2,45.0 df_t.dropna,No,5,17.0 df_t['Type']=df_t['Type'].fillna(0),No,5,17.0 df_t.info(),No,5,40.0 " p=k.predict(df_t) ",No,5,48.0 "sub=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv"") sub['Id']=i_d'",No,5,45.0 "sub.to_csv(""Submission1.csv"",index=False)",No,5,25.0 "b""import pandas as pd\n\ntrain = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip')\ntest = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip')\n\n# Id\ntrain_Id = train.Id\ntest_Id = test.Id\n\n# Id\ntrain.drop('Id', axis=1, inplace=True)\ntest.drop('Id', axis=1, inplace=True)""",No,3,45.0 "#importing the libraries import numpy as np import pandas as pd import matplotlib.pyplot as plt # import seaborn as sns import seaborn as sns; sns.set(style=""ticks"", color_codes=True) from datetime import datetime from scipy import stats from scipy.stats import norm, skew from 
sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split import lightgbm as lgb # 100() # pd.options.display.max_columns = None pd.options.display.max_rows = 80 # 2() pd.options.display.float_format = '{:.2f}'.format %matplotlib inline # import warnings warnings.filterwarnings('ignore') %matplotlib inline'",No,4,23.0 "print('Size of train data', train.shape) print('Size of test data', test.shape)",No,5,58.0 train.describe(include='O'),No,5,40.0 "train[""revenue""].describe()",No,5,40.0 "#revenueQ-Q # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(train['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(train['revenue'], plot=ax2) plt.show() # print(train['revenue'].describe()) print(""------------------------------"") print("": %f"" % train['revenue'].skew()) print("": %f"" % train['revenue'].kurt())'",No,3,33.0 "# dataframe df = train.copy() #log(x+1) df['revenue'] = np.log1p(df['revenue']) # (0, 1) scaler=StandardScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,2,33.0 "# dataframe df = train.copy() # (0, 1) scaler=StandardScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,5,33.0 "# dataframe df = train.copy() # Min-Max((1, 0)) scaler=MinMaxScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,3,33.0 "b""# \n# Open Date\ntrain['pd_date'] = pd.to_datetime(train['Open Date'], format='%m/%d/%Y')\n# \ntrain['Open_Year'] = train['pd_date'].dt.strftime('%Y')\n# \ntrain['Open_Month'] = train['pd_date'].dt.strftime('%m')\n\ntrain = train.drop('pd_date',axis=1)\ntrain = train.drop('Open Date',axis=1)""",Yes,3,8.0 "b""# \n# Open Date\ntest['pd_date'] = pd.to_datetime(test['Open Date'], format='%m/%d/%Y')\n# \ntest['Open_Year'] = test['pd_date'].dt.strftime('%Y')\n# \ntest['Open_Month'] = test['pd_date'].dt.strftime('%m')\n\ntest = test.drop('pd_date',axis=1)\ntest = test.drop('Open Date',axis=1)""",Yes,3,8.0 train.dtypes.value_counts(),No,5,72.0 "b""#\ncats = list(train.select_dtypes(include=['object']).columns)\nnums = list(train.select_dtypes(exclude=['object']).columns)\nprint(f'categorical variables: {cats}')\nprint(f'numerical variables: {nums}')""",No,3,37.0 train.nunique(axis=0),No,5,54.0 "columns = len(nominal_list)/2+1 fig = 
plt.figure(figsize=(30, 20)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(nominal_list)): ax = fig.add_subplot(columns, 2, i+1) sns.countplot(x=nominal_list[i], data=train, ax=ax) plt.xticks(rotation=45) plt.show()",No,5,33.0 "columns = len(num_list)/3+1 fig = plt.figure(figsize=(30, 40)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(num_list)): ax = fig.add_subplot(columns, 3, i+1) train[num_list[i]].hist(ax=ax) ax2 = train[num_list[i]].plot.kde(ax=ax, secondary_y=True,title=num_list[i]) ax2.set_ylim(0) plt.show()",No,5,33.0 "columns = len(nominal_list)/2+1 fig = plt.figure(figsize=(20, 10)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(nominal_list)): ax = fig.add_subplot(columns, 2, i+1) # sns.boxplot(x=nominal_list[i], y=train.revenue, data=train, ax=ax) plt.xticks(rotation=45) # # sns.barplot(x = nominal_list[i], y = train.revenue, data=train, ax=ax) plt.show() '",No,5,33.0 "train = train.drop('Open_Month',axis=1) test= test.drop('Open_Month',axis=1) nominal_list.remove('Open_Month')",No,5,10.0 "b""columns = len(num_list)/4+1\n\nfig = plt.figure(figsize=(30, 35))\nplt.subplots_adjust(hspace=0.6, wspace=0.4)\n\nfor i in range(len(num_list)):\n ax = fig.add_subplot(columns, 4, i+1)\n\n # \n sns.regplot(x=num_list[i],y='revenue',data=train, ax=ax)\n plt.xticks(rotation=45)\n # \n# sns.barplot(x = nominal_list[i], y = train.revenue, data=train, ax=ax)\nplt.show()\n""",No,5,33.0 "train[['City','revenue']].groupby('City').mean().plot(kind='bar') plt.title('Mean Revenue Generated vs City') plt.xlabel('City') plt.ylabel('Mean Revenue Generated')",No,5,33.0 "b""# Cityrevenue1000000\nmean_revenue_per_city = train[['City', 'revenue']].groupby('City', as_index=False).mean()\nmean_revenue_per_city.head()\nmean_revenue_per_city['revenue'] = mean_revenue_per_city['revenue'].apply(lambda x: int(x/1e6)) \n\nmean_revenue_per_city\n\nmean_dict = dict(zip(mean_revenue_per_city.City, mean_revenue_per_city.revenue))\nmean_dict""",No,3,60.0 " print(train['City'].sort_values().unique())",No,5,57.0 "test['City'].sort_values().unique() ",No,5,57.0 "b""# City\ncity_train_list = list(train['City'].unique())\ncity_test_list = list(test['City'].unique())""",No,5,57.0 "# P1 # PP distinct_cities = train.loc[:, ""City""].unique() # Pcity means = [] for i in range(len(num_list)): temp = [] for city in distinct_cities: temp.append(train.loc[train.City == city, num_list[i]].mean()) means.append(temp) city_pvars = pd.DataFrame(columns=[""city_var"", ""means""]) for i in range(37): for j in range(len(distinct_cities)): city_pvars.loc[i+37*j] = [""P""+str(i+1), means[i][j]] print(city_pvars) # plt.rcParams['figure.figsize'] = (18.0, 6.0) sns.boxplot(x=""city_var"", y=""means"", data=city_pvars) # From this we observe that P1, P2, P11, P19, P20, P23, and P30 are approximately a good # proxy for geographical location.",No,5,53.0 "from sklearn import cluster def adjust_cities(full_full_data, train, k): # As found by box plot of each city's mean over each p-var relevant_pvars = [""P1"", ""P2"", ""P11"", ""P19"", ""P20"", ""P23"",""P30""] train = train.loc[:, relevant_pvars] # Optimal k is 20 as found by DB-Index plot kmeans = cluster.KMeans(n_clusters=k) kmeans.fit(train) # Get the cluster centers and classify city of each full_data instance to one of the centers full_data['City_Cluster'] = kmeans.predict(full_data.loc[:, relevant_pvars]) return full_data'",No,5,84.0 "num_train = train.shape[0] num_test = test.shape[0] print(num_train, num_test) full_data = pd.concat([train, 
test], ignore_index=True) ",No,4,11.0 "b""# \nfull_data = adjust_cities(full_data, train, 20)\nfull_data\n\n# City\nfull_data = full_data.drop(['City'], axis=1)""",No,3,8.0 "# Split into train and test datasets train = full_data[:num_train] test = full_data[num_train:] # check the shapes print(""Train :"",train.shape) print(""Test:"",test.shape) test",No,4,13.0 "train[['City_Cluster','revenue']].groupby('City_Cluster').mean().plot(kind='bar') plt.title('Mean Revenue Generated vs City Cluster') plt.xlabel('City Cluster') plt.ylabel('Mean Revenue Generated')",No,5,33.0 "mean_revenue_per_city = train[['City_Cluster', 'revenue']].groupby('City_Cluster', as_index=False).mean() mean_revenue_per_city.head() mean_revenue_per_city['revenue'] = mean_revenue_per_city['revenue'].apply(lambda x: int(x/1e6)) mean_revenue_per_city mean_dict = dict(zip(mean_revenue_per_city.City_Cluster, mean_revenue_per_city.revenue)) mean_dict",No,2,60.0 "b""city_rev = []\n\nfor i in full_data['City_Cluster']:\n for key, value in mean_dict.items():\n if i == key:\n city_rev.append(value)\n \ndf_city_rev = pd.DataFrame({'city_rev':city_rev})\nfull_data = pd.concat([full_data,df_city_rev],axis=1)\nfull_data.head\n\n# \nnominal_list.extend(['City_Cluster'])\n# \nnominal_list.remove('City')\n""",No,5,53.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() le_count = 0 # Iterate through the columns # for col in application_full_data: for i in range(len(nominal_list)): # if application_full_data[col].dtype == 'object': # If 2 or fewer unique categories if len(list(full_data[nominal_list[i]].unique())) <= 2: # full_data on the full_dataing data le.fit(full_data[nominal_list[i]]) # Transform both full_dataing and testing data full_data[nominal_list[i]] = le.transform(full_data[nominal_list[i]]) # Keep track of how many columns were label encoded le_count += 1 print('%d columns were label encoded.' 
% le_count)",No,5,20.0 "# one-hot encoding of categorical variables full_data = pd.get_dummies(full_data) print('full_dataing Features shape: ', full_data.shape)",No,4,20.0 "# # for col in num_list: # outliers = tukey_outliers(train[col]) # if len(outliers): # print(f""* {col} has these tukey outliers,\ {outliers}\ "") # else: # print(f""* {col} doesn't have any tukey outliers.\ "")'",No,5,53.0 "columns = len(num_list)/4+1 # boxplot fig = plt.figure(figsize=(15,20)) plt.subplots_adjust(hspace=0.2, wspace=0.8) for i in range(len(num_list)): ax = fig.add_subplot(columns, 4, i+1) sns.boxplot(y=full_data[num_list[i]], data=full_data, ax=ax) plt.show()",No,5,33.0 "skewed_data = train[num_list].apply(lambda x: skew(x)).sort_values(ascending=False) skewed_data[:10]",No,3,47.0 "# Split into train and test datasets train = full_data[:num_train] test = full_data[num_train:] # check the shapes print(""Train :"",train.shape) print(""Test:"",test.shape)",No,4,13.0 "sns.set(font_scale=1.1) correlation_train = train.corr() mask = np.triu(correlation_train.corr()) fig = plt.figure(figsize=(50,50)) sns.heatmap(correlation_train, annot=True, fmt='.1f', cmap='coolwarm', square=True, # mask=mask, linewidths=1) plt.show()",No,5,80.0 "# 10 train = train[cols] # train_X = train.drop(""revenue"",axis=1) train_y = train[""revenue""] #revenue train_y = np.log1p(train_y) # tmp_cols = train_X.columns test_X = test[tmp_cols] # print(""train_X: ""+str(train_X.shape)) print(""train_y: ""+str(train_y.shape)) print(""test_X: ""+str(test_X.shape))'",No,3,13.0 "# from sklearn.model_selection import train_test_split # (X_train, X_test, y_train, y_test) = train_test_split(train_X, train_y , test_size = 0.3 , random_state = 0) print(""X_train: ""+str(X_train.shape)) print(""X_test: ""+str(X_test.shape)) print(""y_train: ""+str(y_train.shape)) print(""y_test: ""+str(y_test.shape))'",No,4,13.0 "from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve from sklearn.metrics import mean_absolute_error from sklearn.linear_model import Lasso from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge from sklearn.linear_model import ElasticNet from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVR from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import AdaBoostRegressor from sklearn.tree import DecisionTreeRegressor from xgboost import XGBRegressor",No,5,22.0 "# random_state = 2 classifiers = [] classifiers.append(Lasso(random_state=random_state)) classifiers.append(LinearRegression()) classifiers.append(Ridge(random_state=random_state)) classifiers.append(ElasticNet(random_state=random_state)) classifiers.append(KNeighborsRegressor()) classifiers.append(SVR()) classifiers.append(RandomForestRegressor(random_state=random_state)) classifiers.append(GradientBoostingRegressor()) classifiers.append(AdaBoostRegressor(random_state = random_state)) classifiers.append(DecisionTreeRegressor()) classifiers.append(XGBRegressor())'",No,5,4.0 "#classifier cv_results = [] for classifier in classifiers : cv_results.append(cross_val_score(classifier, X_train, y_train, scoring='neg_mean_squared_error', cv =10, n_jobs=4)) #classifier cv_means = [] cv_std = [] for cv_result in cv_results: cv_means.append(cv_result.mean()) cv_std.append(cv_result.std()) cv_res = pd.DataFrame({""CrossValMeans"":cv_means,""CrossValerrors"": cv_std,""Algorithm"":[""Lasso"",""LinearRegression"",""Ridge"", 
""ElasticNet"",""KNeighborsRegressor"",""SVR"",""RandomForestRegressor"",""GradientBoostingRegressor"",""AdaBoostRegressor"",""DecisionTreeRegressor"", ""XGBRegressor""]})'",No,3,28.0 "g = sns.barplot(""CrossValMeans"",""Algorithm"",data = cv_res, palette=""Set3"",orient = ""h"",**{'xerr':cv_std}) g.set_xlabel(""Mean Accuracy"") g = g.set_title(""Cross validation scores"")'",No,5,84.0 "cv_res.sort_values(ascending=False, by='CrossValMeans')",No,5,9.0 "from sklearn import datasets from sklearn.linear_model import Ridge from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error import optuna def objective(trial): params = { 'alpha': trial.suggest_loguniform(""alpha"", 0.1, 5), 'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]), 'normalize': trial.suggest_categorical('normalize', [True, False]), } reg = Ridge(**params) reg.fit(X_train, y_train) y_pred = reg.predict(X_test) mae = mean_absolute_error(y_test, y_pred) return mae '",No,3,7.0 "b""# optuna \nstudy = optuna.create_study()\nstudy.optimize(objective, n_trials=100)\n\n# \nprint(f'best score: {study.best_value:.4f}, best params: {study.best_params}')""",No,4,2.0 "params = {'alpha': 1.9510706324753746, 'fit_intercept': True, 'normalize': True} reg = Ridge(**params) reg.fit(X_train, y_train) prediction_log = reg.predict(test_X) prediction =np.exp(prediction_log) print(prediction)",No,3,48.0 "# CSV(submission) submission = pd.DataFrame({""Id"":test_Id, ""Prediction"":prediction}) submission.to_csv(""submission.csv"", index=False)'",No,3,25.0 "%matplotlib inline import os import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import matplotlib from scipy import stats from scipy.stats import norm, skew from sklearn import preprocessing from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from xgboost import XGBRegressor, plot_importance from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,3,22.0 "df = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') df.shape",No,4,45.0 "test_df = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') test_df.shape",No,5,45.0 df.head(),No,5,41.0 "def display_all(df): with pd.option_context(""display.max_rows"", 1000, ""display.max_columns"", 1000): display(df) display_all(df.head().transpose())",No,5,41.0 df.isnull().sum().sort_index()/len(df),No,5,39.0 "fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.countplot(df['Type'],palette=""Set2"", ax=ax[0]); g2 = sns.countplot(test_df['Type'],palette=""Set2"", ax=ax[1]); fig.show()'",No,5,33.0 "fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.countplot(df['City Group'],palette=""Set2"", ax=ax[0]); g2 = sns.countplot(test_df['City Group'],palette=""Set2"", ax=ax[1]); fig.show()'",No,5,33.0 "(df['City'].nunique(), test_df['City'].nunique())",No,5,54.0 "test_df.loc[test_df['Type']=='MB', 'Type'] = 'DT'",No,5,8.0 "df.drop('City', axis=1, inplace=True) test_df.drop('City', axis=1, inplace=True)",No,5,10.0 "import datetime df.drop('Id',axis=1,inplace=True) df['Open Date'] = pd.to_datetime(df['Open Date']) test_df['Open Date'] = pd.to_datetime(test_df['Open 
Date']) launch_date = datetime.datetime(2015, 3, 23) # scale days open df['Days Open'] = (launch_date - df['Open Date']).dt.days / 1000 test_df['Days Open'] = (launch_date - test_df['Open Date']).dt.days / 1000 df.drop('Open Date', axis=1, inplace=True) test_df.drop('Open Date', axis=1, inplace=True)",Yes,3,8.0 "plt.rc('figure', max_open_warning = 0) for i in range(1,38): fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.distplot(df['P{}'.format(i)], ax=ax[0], kde=False); g2 = sns.distplot(test_df['P{}'.format(i)], ax=ax[1], kde=False); fig.show()",No,5,33.0 df.dtypes,No,1,37.0 "b""(mu, sigma) = norm.fit(df['revenue'])\nf, (ax1, ax2) = plt.subplots(1, 2, figsize=(19, 5))\nax1 = sns.distplot(df['revenue'] , fit=norm, ax=ax1)\nax1.legend([f'Normal distribution ($\\mu=$ {mu:.3f} and $\\sigma=$ {sigma:.3f})'], loc='best')\nax1.set_ylabel('Frequency')\nax1.set_title('Revenue Distribution')\nax2 = stats.probplot(df['revenue'], plot=plt)\nf.show();""",No,5,33.0 "b""# Revenue is right skewed, taking the log will make it more normally distributed for the linear models\n# Remember to use expm1 on predictions to transform back to dollar amount\n(mu, sigma) = norm.fit(np.log1p(df['revenue']))\nf, (ax1, ax2) = plt.subplots(1, 2, figsize=(19, 5))\nax1 = sns.distplot(np.log1p(df['revenue']) , fit=norm, ax=ax1)\nax1.legend([f'Normal distribution ($\\mu=$ {mu:.3f} and $\\sigma=$ {sigma:.3f})'], loc='best')\nax1.set_ylabel('Frequency')\nax1.set_title('Log(1+Revenue) Distribution')\nax2 = stats.probplot(np.log(df['revenue']), plot=plt)\nf.show();""",No,4,33.0 "# Correlation between numeric features with revenue plt.figure(figsize=(10, 8)) sns.heatmap(df.drop(['revenue','City Group','Type'], axis=1).corr(), square=True) plt.suptitle('Pearson Correlation Heatmap') plt.show();",No,5,80.0 "corr_with_revenue = df.drop(['City Group','Type'],axis=1).corr()['revenue'].sort_values(ascending=False) plt.figure(figsize=(10,7)) corr_with_revenue.drop('revenue').plot.bar() plt.show();",No,5,33.0 "sns.pairplot(df[df.corr()['revenue'].sort_values(ascending=False).index[:5]]) plt.show();",No,5,81.0 "# copy_df = df.copy() # copy_test_df = test_df.copy() # numeric_features = df.dtypes[df.dtypes != ""object""].index # skewed_features = df[numeric_features].apply(lambda x: skew(x)) # skewed_features = skewed_features[skewed_features > 0.5].index # df[skewed_features] = np.log1p(df[skewed_features]) # test_df[skewed_features.drop('revenue')] = np.log1p(test_df[skewed_features.drop('revenue')]) # Above handles skewed features using log transformation # Below uses multiple imputation for P1-P37, since they are actually categorical from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer imp_train = IterativeImputer(max_iter=30, missing_values=0, sample_posterior=True, min_value=1, random_state=37) imp_test = IterativeImputer(max_iter=30, missing_values=0, sample_posterior=True, min_value=1, random_state=23) p_data = ['P'+str(i) for i in range(1,38)] df[p_data] = np.round(imp_train.fit_transform(df[p_data])) test_df[p_data] = np.round(imp_test.fit_transform(test_df[p_data]))'",No,4,17.0 "# drop_first=True for Dummy Encoding for object types, and drop_first=False for OHE columnsToEncode = df.select_dtypes(include=[object]).columns df = pd.get_dummies(df, columns=columnsToEncode, drop_first=False) test_df = pd.get_dummies(test_df, columns=columnsToEncode, drop_first=False)",No,5,20.0 "df['revenue'] = np.log1p(df['revenue']) X, y = df.drop('revenue', axis=1), df['revenue'] X_train, 
X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=118)",No,3,13.0 "from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso",No,5,22.0 "params_ridge = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], 'solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'] } ridge_model = Ridge() ridge_regressor = GridSearchCV(ridge_model, params_ridge, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1) ridge_regressor.fit(X_train, y_train) print(f'Optimal alpha: {ridge_regressor.best_params_[""alpha""]:.2f}') print(f'Optimal fit_intercept: {ridge_regressor.best_params_[""fit_intercept""]}') print(f'Optimal normalize: {ridge_regressor.best_params_[""normalize""]}') print(f'Optimal solver: {ridge_regressor.best_params_[""solver""]}') print(f'Best score: {ridge_regressor.best_score_}')'",No,4,6.0 "ridge_model = Ridge(alpha=ridge_regressor.best_params_[""alpha""], fit_intercept=ridge_regressor.best_params_[""fit_intercept""], normalize=ridge_regressor.best_params_[""normalize""], solver=ridge_regressor.best_params_[""solver""]) ridge_model.fit(X_train, y_train) y_train_pred = ridge_model.predict(X_train) y_pred = ridge_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,28.0 "# Ridge Model Feature Importance ridge_feature_coef = pd.Series(index = X_train.columns, data = np.abs(ridge_model.coef_)) ridge_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,3,79.0 "params_lasso = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], } lasso_model = Lasso() lasso_regressor = GridSearchCV(lasso_model, params_lasso, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1) lasso_regressor.fit(X_train, y_train) print(f'Optimal alpha: {lasso_regressor.best_params_[""alpha""]:.2f}') print(f'Optimal fit_intercept: {lasso_regressor.best_params_[""fit_intercept""]}') print(f'Optimal normalize: {lasso_regressor.best_params_[""normalize""]}') print(f'Best score: {lasso_regressor.best_score_}')'",No,4,6.0 "lasso_model = Lasso(alpha=lasso_regressor.best_params_[""alpha""], fit_intercept=lasso_regressor.best_params_[""fit_intercept""], normalize=lasso_regressor.best_params_[""normalize""]) lasso_model.fit(X_train, y_train) y_train_pred = lasso_model.predict(X_train) y_pred = lasso_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,4,28.0 "# Lasso Model Feature Importance lasso_feature_coef = pd.Series(index = X_train.columns, data = np.abs(lasso_model.coef_)) lasso_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,3,79.0 "from sklearn.linear_model import ElasticNetCV, ElasticNet # Use ElasticNetCV to tune alpha automatically instead of redundantly using ElasticNet and GridSearchCV el_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], eps=5e-2, cv=10, n_jobs=-1) el_model.fit(X_train, 
y_train) print(f'Optimal alpha: {el_model.alpha_:.6f}') print(f'Optimal l1_ratio: {el_model.l1_ratio_:.3f}') print(f'Number of iterations {el_model.n_iter_}')",No,4,6.0 "y_train_pred = el_model.predict(X_train) y_pred = el_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,3,28.0 "# ElasticNet Model Feature Importance el_feature_coef = pd.Series(index = X_train.columns, data = np.abs(el_model.coef_)) n_features = (el_feature_coef>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(el_feature_coef))*100:2.2f}%') el_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,5,79.0 "import numpy as np import pandas as pd from matplotlib import pyplot as plt import seaborn as sns import os from math import sqrt",No,5,22.0 "for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "PATH='/kaggle/input/restaurant-revenue-prediction' train_df=pd.read_csv(os.path.join(PATH,'train.csv.zip')) test_df=pd.read_csv(os.path.join(PATH,'test.csv.zip'))",No,5,45.0 "print('Train Data Shape:',train_df.shape) print('Test Data Shape:',test_df.shape) print('Features:',train_df.columns)",No,4,58.0 train_df.head(),No,5,41.0 "sns.distplot(train_df['revenue'],hist=False) plt.title('Target Variable Distribution') plt.show()",No,5,33.0 train_df.isnull().sum(),No,5,39.0 "def get_month(date): return int(date.split('/')[0]) def get_year(date): return int(date.split('/')[-1]) train_df['Month']=train_df['Open Date'].apply(get_month) train_df['Year']=train_df['Open Date'].apply(get_year)",No,5,8.0 "test_df['Month']=test_df['Open Date'].apply(get_month) test_df['Year']=test_df['Open Date'].apply(get_year)",No,5,8.0 "print(train_df['Id'].shape) # the id has all unique values hence removing it train_df.drop('Id',axis=1,inplace=True) test_indexes=test_df['Id'] test_df.drop('Id',axis=1,inplace=True)",No,3,10.0 "plt.figure(figsize=(10,5)) sns.countplot(x='Month',data=train_df) plt.xlabel('Opening Month') plt.ylabel('Openings') plt.title('No of openings per month') plt.show()",No,5,75.0 "plt.figure(figsize=(13,5)) months_revenue_mean=train_df.groupby('Month')['revenue'].mean() sns.pointplot(x=months_revenue_mean.index,y=months_revenue_mean.values) plt.title('Revenue Vs Month') plt.show()",No,5,33.0 "(train_df['Month']=='05').sum() (train_df['Month']=='06').sum() (train_df['Month']=='07').sum()",No,5,72.0 "plt.figure(figsize=(13,5)) sns.countplot(x='Year',data=train_df) plt.ylabel('Number of Openings') plt.title('Number Of Openings Per Year') plt.show()",No,5,75.0 "plt.figure(figsize=(14,5)) year_revenue_means=train_df.groupby('Year')['revenue'].mean() sns.pointplot(year_revenue_means.index,year_revenue_means.values) plt.xlabel('Revenue') plt.ylabel('Year') plt.title('Revenue Per Year') plt.show()",No,5,33.0 "print('Datapoints in Year 2013:',(train_df['Year']=='2013').sum()) print('Datapoints in Year 2014:',(train_df['Year']=='2014').sum())",No,5,40.0 "print(""City Group Categoies:"",train_df['City Group'].unique())'",No,5,57.0 "sns.countplot('City Group', data=train_df) plt.title('City Group Counts') plt.show()",No,5,33.0 train_df['City Group'].value_counts(),No,5,72.0 "city_group_revenue_means=train_df.groupby('City Group')['revenue'].sum() 
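# note: despite the _means suffix this holds the total revenue per city group, since .sum() is used rather than .mean()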
city_group_revenue_means",No,5,60.0 "sns.lineplot(x='City Group',y='revenue',data=train_df)",No,5,81.0 "## converting it into dummies city_group_dummies=pd.get_dummies(train_df['City Group']) train_df=pd.concat([train_df,city_group_dummies],axis=1)",No,4,11.0 "test_city_group_dummies=pd.get_dummies(test_df['City Group']) test_df=pd.concat([test_df,test_city_group_dummies],axis=1)",No,4,11.0 "print('Tyes in train df:',train_df['Type'].unique()) print('Types in test df:',test_df['Type'].unique())",No,5,57.0 "fig,ax = plt.subplots(1,2,figsize=(9,5)) sns.countplot(train_df.Type,ax=ax[0]) ax[0].set_title('Train set') sns.countplot(test_df.Type,ax=ax[1]) ax[1].set_title('Test set') plt.show()",No,5,33.0 "type_map={'IL':0,'FC':1,'DT':2,'MB':3} train_df['Type']=train_df['Type'].apply(lambda type:type_map[type]) test_df['Type']=test_df['Type'].apply(lambda type:type_map[type])",No,4,20.0 "## converting the type into dummies type_dummies=pd.get_dummies(train_df['Type']) train_df=pd.concat([train_df,type_dummies],axis=1) train_df['3']=[0]*train_df.shape[0]",No,4,11.0 "test_type_dummies=pd.get_dummies(test_df['Type']) test_df=pd.concat([test_df,test_type_dummies],axis=1)",No,4,11.0 train_df['City'].unique() ,No,5,57.0 "# dropping all the columns which have been utilized already train_df.drop(['Open Date','City','City Group','Type'],axis=1,inplace=True) test_df.drop(['Open Date','City','City Group','Type'],axis=1,inplace=True)",No,5,10.0 test_df.head(),No,5,41.0 "from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score from sklearn.model_selection import KFold",No,5,22.0 "print('Train Data Shape After EDA:',train_df.shape) print('Test Data Shape After EDA:',test_df.shape)",No,5,58.0 "Y=train_df['revenue'] train_df.drop('revenue',axis=1,inplace=True) X=train_df.values X_Test=test_df.values",No,5,21.0 "X.shape,X_Test.shape",No,5,58.0 "regressor_models={ 'Linear Regression':LinearRegression(), 'Decision Tree Regressor':DecisionTreeRegressor(), 'Random Forest Regressor':RandomForestRegressor(), 'SVR':SVR(), }",No,3,23.0 "def get_rmse_score(model,x_train,x_test,y_train,y_test): model.fit(x_train,y_train) y_predicted=model.predict(x_test) r2_score=model.score(x_test,y_test) rmse=sqrt(mean_squared_error(y_test,y_predicted)) return rmse,r2_score",No,4,28.0 "x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=42)",No,5,13.0 "svr_model=SVR() svr_model.fit(x_train,y_train) Y_Test_predictions=svr_model.predict(X_Test)",No,3,27.0 "predictions=[] for index in range(len(Y_Test_predictions)): predictions.append([test_indexes[index],Y_Test_predictions[index]])",No,3,48.0 "predictions_df=pd.DataFrame(predictions,columns=['Id','Prediction']) predictions_df.to_csv('SVM_Predictions.csv',index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,4,22.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "train_path = '../input/restaurant-revenue-prediction/train.csv.zip' df_train = pd.read_csv(train_path) df_train.head()",No,3,41.0 "test_path = '../input/restaurant-revenue-prediction/test.csv.zip' df_test = pd.read_csv(test_path, index_col='Id') df_test.head()",No,3,41.0 "from datetime import date, datetime def calculate_age(born): born = datetime.strptime(born, ""%m/%d/%Y"").date() today = date.today() return today.year - born.year - ((today.month, today.day) < (born.month, born.day)) df_train['Age'] = df_train['Open Date'].apply(calculate_age) df_test['Age'] = df_test['Open Date'].apply(calculate_age) # Drop 'Open Date' column from Dataframes df_train = df_train.drop('Open Date', axis=1) df_test = df_test.drop('Open Date', axis=1) # Drop 'Id' column from Dataframes df_train = df_train.drop('Id', axis=1) df_train.head()'",No,3,41.0 "# Find the sets of categorical variables and numberical variales for feature analyses numerical_features = df_train.select_dtypes([np.number]).columns.tolist() categorical_features = df_train.select_dtypes(exclude = [np.number,np.datetime64]).columns.tolist() print(categorical_features) print(numerical_features)",No,3,40.0 "print(df_train['revenue'].describe()) sns.distplot(a=df_train['revenue'], kde=True).set(xlabel='revenue', ylabel='P(revenue)')",No,5,33.0 df_train[df_train['revenue'] > 10000000 ],No,5,14.0 "# Drop outliers df_train = df_train[df_train['revenue'] < 10000000 ] df_train.reset_index(drop=True).head()",No,3,10.0 "fig, ax = plt.subplots(3, 1, figsize=(40, 30)) for variable, subplot in zip(categorical_features, ax.flatten()): df_2 = df_train[[variable,'revenue']].groupby(variable).revenue.sum().reset_index() df_2.columns = [variable,'total_revenue'] sns.barplot(x=variable, y='total_revenue', data=df_2 , ax=subplot) subplot.set_xlabel(variable,fontsize=20) subplot.set_ylabel('Total Revenue',fontsize=20) for label in subplot.get_xticklabels(): label.set_rotation(45) label.set_size(20) for label in subplot.get_yticklabels(): label.set_size(20) fig.tight_layout()",No,5,33.0 "fig, ax = plt.subplots(13, 3, figsize=(30, 35)) for variable, subplot in zip(numerical_features, ax.flatten()): sns.regplot(x=df_train[variable], y=df_train['revenue'], ax=subplot) subplot.set_xlabel(variable,fontsize=20) subplot.set_ylabel('Revenue',fontsize=20) fig.tight_layout()",No,5,33.0 "plt.figure(figsize=(45,25)) mask = np.triu(np.ones_like(df_train.corr(), dtype=np.bool)) sns.heatmap(df_train.corr(),annot=True, mask=mask) sns.set(font_scale=1.4)",No,5,80.0 "fig, ax = plt.subplots(3, 1, figsize=(40, 30)) for variable, subplot in zip(categorical_features, ax.flatten()): sns.swarmplot(x=variable, y='revenue', data=df_train, ax=subplot,size=10) subplot.set_xlabel(variable,fontsize=20) 
subplot.set_ylabel('Revenue',fontsize=20) for label in subplot.get_xticklabels(): label.set_rotation(45) label.set_size(18) for label in subplot.get_yticklabels(): label.set_size(18) fig.tight_layout()",No,5,33.0 "#Lets take a look at city group field print(""--- Train set ---"") print(df_train['City Group'].value_counts()) print(""---- Test set ----"") print(df_test['City Group'].value_counts())'",No,5,72.0 "#Lets take a look at type field print(""--- Train set ---"") print(df_train['Type'].value_counts()) print(""---- Test set ----"") print(df_test['Type'].value_counts()) '",No,5,72.0 "y = df_train['revenue'] df_train=df_train.drop('revenue', axis=1)",No,5,21.0 "print(""Shapes: Train set "", df_train.shape ,"", Test "",df_test.shape) df_full = pd.concat([df_train,df_test]) print(""Full dataset shapes: "", df_full.shape)",No,4,58.0 print('There are {} cities which restaurant location have been collected.'.format(len(df_full['City'].unique()))),No,3,58.0 "df_full = df_full.drop('City', axis=1) df_full.shape",No,4,10.0 "p_name = ['P'+str(i) for i in range(1,38)]",No,3,58.0 "from sklearn.decomposition import PCA pca = PCA().fit(df_full[p_name]) plt.figure(figsize=(7,5)) plt.plot(np.cumsum(pca.explained_variance_ratio_)) plt.xlabel('number of Components') plt.ylabel('Explained variance') plt.yticks(np.arange(0.1,1.1,0.05)) plt.xticks(np.arange(0,41,2)) plt.grid(True)",No,3,23.0 "pca_list = ['pca'+str(i) for i in range(1,30,1)] df_full[pca_list] = PCA(n_components=29).fit_transform(df_full[p_name]) df_full.drop(p_name,axis=1,inplace=True)",No,3,33.0 df_full.info(),No,4,10.0 "df=pd.get_dummies(df_full, dtype=float)",No,3,40.0 "# Get number of train sets numTrain=df_train.shape[0] train = df[:numTrain] test = df[numTrain:]",No,3,13.0 "sns.distplot(a=y, kde=True).set(xlabel='revenue', ylabel='P(revenue)')",No,5,33.0 "print(""Kurtosis: {}"".format(y.kurt())) print(""Skewness: {}"".format(y.skew()))",No,5,40.0 "from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso, Ridge, ElasticNet from sklearn.ensemble import AdaBoostRegressor from xgboost import XGBRegressor ",No,4,21.0 best_estimators=[],No,5,77.0 "## Parameters params = { ""alpha"" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], ""fit_intercept"" : [True, False], ""normalize"" : [True,False], ""solver"" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], ""tol"" : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : [42] } ## Ridge ridge = Ridge() ridge_grid = GridSearchCV(ridge, params, scoring='r2', cv=5, n_jobs=-1) ridge_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(ridge_grid.best_params_)) print(""Best score: {}"".format(ridge_grid.best_score_)) ## Append to list best_estimators.append([""Ridge"",ridge_grid.best_estimator_])'",No,4,23.0 "params = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], 'tol' : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : [42] } ## Lasso lasso = Lasso() lasso_grid = GridSearchCV(lasso, params, scoring='r2', cv=5, n_jobs=-1) lasso_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(lasso_grid.best_params_)) print(""Best score: {}"".format(lasso_grid.best_score_)) ## Append to list best_estimators.append([""Lasso"",lasso_grid.best_estimator_])'",No,3,5.0 "# Parameters params = { ""alpha"" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], ""fit_intercept"" : [True, False], ""normalize"" : [True,False], ""tol"" : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : 
[42] } ## Elastic Net EL = ElasticNet() EL_grid = GridSearchCV(EL, params, scoring='r2', cv=5, n_jobs=-1) EL_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(EL_grid.best_params_)) print(""Best score: {}"".format(EL_grid.best_score_)) ## Append to list best_estimators.append([""ElasticNet"",EL_grid.best_estimator_])'",No,3,5.0 "# parameters params = { ""learning_rate"": [.1, .5, .7, .9, .95, .99, 1], ""colsample_bytree"": [.3, .4, .5, .6], ""max_depth"": [2, 4], ""alpha"": [1, 3, 5], ""subsample"": [.5], ""n_estimators"": [30, 70, 100, 200], ""random_state"" : [42] } ## XGBoost Regressor XGBR = XGBRegressor() XGBR_grid = GridSearchCV(XGBR, params, scoring='r2', cv=5, n_jobs=-1) XGBR_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(XGBR_grid.best_params_)) print(""Best score: {}"".format(XGBR_grid.best_score_)) ## Append to list best_estimators.append([""XGBoostR"",XGBR_grid.best_estimator_])'",No,3,5.0 "## parameters params = { ""n_estimators"": [10, 30, 50, 100], ""learning_rate"": [.01, 0.1, 0.5, 0.9, 0.95, 1], ""random_state"" : [42] } ## XGBoost Regressor AdaBoostR = AdaBoostRegressor() AdaBoostR_grid = GridSearchCV(AdaBoostR, params, scoring='r2', cv=5, n_jobs=-1) AdaBoostR_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(AdaBoostR_grid.best_params_)) print(""Best score: {}"".format(AdaBoostR_grid.best_score_)) ## Append to list best_estimators.append([""AdaBoostR"",AdaBoostR_grid.best_estimator_])'",No,3,5.0 "from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler ## pipeline pipelines = [] for name,model in best_estimators: pipeline = Pipeline([(""Scaler"",StandardScaler()), (name,model) ]) pipelines.append([""Scaled_""+name,pipeline])",No,3,5.0 "from sklearn.model_selection import KFold, cross_val_score ## Create a dataframe to store all the models' cross validation score evaluate = pd.DataFrame(columns=[""model"",""cv"",""std""]) ## Encoded dataset for name,model in pipelines: kfold = KFold(n_splits=10,random_state=42) cv = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, scoring=""neg_root_mean_squared_error"") row = evaluate.shape[0] evaluate.loc[row,""model""] = name evaluate.loc[row,""cv""] = round(cv.mean(),3) evaluate.loc[row,""std""] = ""+/- {}"".format(round(cv.std(),4)) evaluate = evaluate.sort_values(""cv"",ascending=False) evaluate'",No,2,22.0 from sklearn.ensemble import VotingRegressor,No,3,56.0 "## Creating a list for all combinations models votings = [] ## All models votings.append((""Scaled_all_models"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""AdaBoostR"", AdaBoostR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]) )]))) ### Combinations of two estimators ## Combination of RandomForestRegressor with BaggingRegressor & GradientBoostRegressor votings.append((""Scaled_XGBR_AB"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""AdaBoostR"", AdaBoostR_grid.best_estimator_) ]))]) )) votings.append((""Scaled_XGBR_R"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]))]))) votings.append((""Scaled_AB_R"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""AdaBoostR"", AdaBoostR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]))])))",No,5,82.0 !pip install 
python-googlegeocoder,No,4,22.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from googlegeocoder import GoogleGeocoder from bokeh.plotting import figure, save import plotly.graph_objects as go import sklearn import warnings warnings.filterwarnings(""ignore"") plt.style.use('ggplot') pd.plotting.register_matplotlib_converters()'",No,5,23.0 "train_df=pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test_df=pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') all_data=pd.concat([train_df,test_df],axis=0) all_data.reset_index(drop=True)",No,3,45.0 "train_df.info() ",No,5,40.0 train_df['Country_Region'].nunique(),No,5,54.0 "print(""fill blanks and add region for counting"") train_df.drop('Province_State',axis=1,inplace=True)'",No,5,10.0 "# Resetting Date column into Datetime object and making it an index of dataframe train_df['Date']=pd.to_datetime(train_df['Date']) train_df.set_index('Date',inplace=True)",No,4,16.0 "from sklearn.neighbors import KNeighborsRegressor params_knn = { 'n_neighbors' : [3, 5, 7, 9, 11], } knn_model = KNeighborsRegressor() knn_regressor = GridSearchCV(knn_model, params_knn, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1) knn_regressor.fit(X_train, y_train) print(f'Optimal neighbors: {knn_regressor.best_params_[""n_neighbors""]}') print(f'Best score: {knn_regressor.best_score_}')'",No,4,6.0 "knn_model = KNeighborsRegressor(n_neighbors=knn_regressor.best_params_[""n_neighbors""]) knn_model.fit(X_train, y_train) y_train_pred = knn_model.predict(X_train) y_pred = knn_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "from sklearn.ensemble import RandomForestRegressor params_rf = { 'max_depth': [10, 30, 35, 50, 65, 75, 100], 'max_features': [.3, .4, .5, .6], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [30, 50, 100, 200] } rf = RandomForestRegressor() rf_regressor = GridSearchCV(rf, params_rf, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) rf_regressor.fit(X_train, y_train) print(f'Optimal depth: {rf_regressor.best_params_[""max_depth""]}') print(f'Optimal max_features: {rf_regressor.best_params_[""max_features""]}') print(f'Optimal min_sample_leaf: {rf_regressor.best_params_[""min_samples_leaf""]}') print(f'Optimal min_samples_split: {rf_regressor.best_params_[""min_samples_split""]}') print(f'Optimal n_estimators: {rf_regressor.best_params_[""n_estimators""]}') print(f'Best score: {rf_regressor.best_score_}')'",No,5,2.0 "rf_model = RandomForestRegressor(max_depth=rf_regressor.best_params_[""max_depth""], max_features=rf_regressor.best_params_[""max_features""], min_samples_leaf=rf_regressor.best_params_[""min_samples_leaf""], min_samples_split=rf_regressor.best_params_[""min_samples_split""], n_estimators=rf_regressor.best_params_[""n_estimators""], n_jobs=-1, oob_score=True) rf_model.fit(X_train, y_train) y_train_pred = rf_model.predict(X_train) y_pred = rf_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: 
{train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "# # City, City Group, Type from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df_all[""City""] = le.fit_transform(df_all[""City""]) df_all[""City Group""] = le.fit_transform(df_all[""City Group""]) df_all[""Type""] = le.fit_transform(df_all[""Type""]) # df_all'",No,4,20.0 "# df_train_fin = df_all.iloc[:df_train.shape[0]] # df_train df_test_fin = df_all.iloc[df_train.shape[0]:] # df_test'",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor # IDOpenDate out_columns = [""Id"", ""Open Date""] columns = [] for i in df_train_fin.columns: if i not in out_columns: columns.append(i) x_train = df_train_fin[columns] # rfr = RandomForestRegressor( n_estimators=200, max_depth=5, max_features=0.5, random_state=449, n_jobs=-1 ) rfr.fit(x_train, y_train) # rfr.score(x_train, y_train)'",No,3,7.0 pred = rfr.predict(df_test_fin[columns]),No,5,48.0 df_submission,No,5,41.0 "df_submission['Prediction'] = pred df_submission.to_csv('/kaggle/working/RandamForest_submission01.csv', index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from matplotlib import pyplot as plt import seaborn as sns # from pandas_profiling import ProfileReport # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') submission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 "from plotly.offline import iplot from plotly import tools import plotly.graph_objects as go import plotly.express as px import plotly.offline as py import plotly.figure_factory as ff from plotly.subplots import make_subplots import plotly.io as pio pio.templates.default = ""plotly_dark"" py.init_notebook_mode(connected=True)",No,5,23.0 "latest_grouped = train.groupby('Country_Region')['ConfirmedCases', 'Fatalities'].sum().reset_index()",No,5,60.0 "fig = px.bar(latest_grouped.sort_values('ConfirmedCases', ascending=False)[:20][::-1], x='ConfirmedCases', y='Country_Region', title='Confirmed Cases Worldwide', text='ConfirmedCases', height=1000, orientation='h') fig.show()",No,5,33.0 "europe = list(['Austria','Belgium','Bulgaria','Croatia','Cyprus','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland', 'Italy', 'Latvia','Luxembourg','Lithuania','Malta','Norway','Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia', 'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland', 'Serbia', 'Ukraine', 'Belarus', 'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro', 'North Macedonia']) europe_grouped_latest = latest_grouped[latest_grouped['Country_Region'].isin(europe)]",No,5,14.0 "temp = train[train['Country_Region'].isin(europe)] temp = temp.groupby(['Date', 
'Country_Region'])['ConfirmedCases'].sum().reset_index() temp['Date'] = pd.to_datetime(temp['Date']).dt.strftime('%m/%d/%Y') temp['size'] = temp['ConfirmedCases'].pow(0.3) * 3.5 fig = px.scatter_geo(temp, locations=""Country_Region"", locationmode='country names', color=""ConfirmedCases"", size='size', hover_name=""Country_Region"", range_color=[1,100],scope='europe', projection=""natural earth"", animation_frame=""Date"", title='COVID-19: Cases Over Time', color_continuous_scale='Cividis_r') fig.show()'",No,5,33.0 "fig = px.bar(europe_grouped_latest.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Country_Region', color_discrete_sequence=['#84DCC6'], title='Confirmed Cases in Europe', text='ConfirmedCases', orientation='h') fig.show()",No,5,33.0 "usa = train[train['Country_Region'] == ""US""] usa_latest = usa[usa['Date'] == max(usa['Date'])] usa_latest = usa_latest.groupby('Province_State')['ConfirmedCases', 'Fatalities'].max().reset_index() fig = px.bar(usa_latest.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Province_State', color_discrete_sequence=['#D63230'], title='Confirmed Cases in USA', text='ConfirmedCases', orientation='h') fig.show()'",No,5,33.0 "ch = train[train['Country_Region'] == ""China""] ch = ch[ch['Date'] == max(ch['Date'])] ch = ch.groupby('Province_State')['ConfirmedCases', 'Fatalities'].max().reset_index() fig = px.bar(ch.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Province_State', color_discrete_sequence=['#D63230'], title='Confirmed Cases in china', text='ConfirmedCases', orientation='h') fig.show()'",No,5,33.0 "province_encoded = {state:index for index, state in enumerate(train['Province_State'].unique())}",No,5,77.0 "train['province_encoded'] = train['Province_State'].apply(lambda x: province_encoded[x]) train.head()",No,4,20.0 "country_encoded = dict(enumerate(train['Country_Region'].unique())) country_encoded = dict(map(reversed, country_encoded.items()))",No,5,20.0 "train['country_encoded'] = train['Country_Region'].apply(lambda x: country_encoded[x]) train.head()",No,4,20.0 "from datetime import datetime import time",No,5,22.0 "train['Mon'] = train['Date'].apply(lambda x: int(x.split('-')[1])) train['Day'] = train['Date'].apply(lambda x: int(x.split('-')[2]))",No,5,8.0 "train['serial'] = train['Mon'] * 30 + train['Day'] train.head()",No,4,8.0 train['serial'] = train['serial'] - train['serial'].min(),No,5,8.0 "gdp2020 = pd.read_csv('/kaggle/input/gdp2020/GDP2020.csv') population2020 = pd.read_csv('/kaggle/input/population2020/population2020.csv')",No,5,45.0 "gdp2020 = gdp2020.rename(columns={""rank"":""rank_gdp""}) gdp2020_numeric_list = [list(gdp2020)[0]] + list(gdp2020)[2:-1] gdp2020.head()",No,4,61.0 set(train['Country_Region']) - set(population2020['name']),No,5,57.0 set(train['Country_Region']) - set(gdp2020['country']),No,5,57.0 "population2020 = population2020.rename(columns={""rank"":""rank_pop""}) population2020_numeric_list = [list(population2020)[0]] + list(gdp2020)[2:] population2020.head()",No,4,61.0 "train = pd.merge(train, population2020, how='left', left_on = 'Country_Region', right_on = 'name') train = pd.merge(train, gdp2020, how='left', left_on = 'Country_Region', right_on = 'country')",No,5,32.0 train = train.fillna(-1),No,5,17.0 "# numeric_features_X = ['Lat','Long', 'province_encoded' ,'country_encoded','Mon','Day'] numeric_features_X = ['province_encoded' ,'country_encoded','Mon','Day'] + population2020_numeric_list + 
gdp2020_numeric_list numeric_features_Y = ['ConfirmedCases', 'Fatalities'] train_numeric_X = train[numeric_features_X] train_numeric_Y = train[numeric_features_Y]",No,5,21.0 test['province_encoded'] = test['Province_State'].apply(lambda x: province_encoded[x] if x in province_encoded else max(province_encoded.values())+1),No,5,8.0 test['country_encoded'] = test['Country_Region'].apply(lambda x: country_encoded[x] if x in country_encoded else max(country_encoded.values())+1),No,5,8.0 "test['Mon'] = test['Date'].apply(lambda x: int(x.split('-')[1])) test['Day'] = test['Date'].apply(lambda x: int(x.split('-')[2]))",No,3,16.0 "test['serial'] = test['Mon'] * 30 + test['Day'] test['serial'] = test['serial'] - test['serial'].min()",No,4,8.0 "test = pd.merge(test, population2020, how='left', left_on = 'Country_Region', right_on = 'name') test = pd.merge(test, gdp2020, how='left', left_on = 'Country_Region', right_on = 'country')",No,5,32.0 "test_numeric_X = test[numeric_features_X] test_numeric_X.isnull().sum()",No,4,21.0 test_numeric_X = test_numeric_X.fillna(-1),No,5,17.0 "from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression",No,5,22.0 "# Random Forest Model Feature Importance rf_feature_importance = pd.Series(index = X_train.columns, data = np.abs(rf_model.feature_importances_)) n_features = (rf_feature_importance>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(rf_feature_importance))*100:2.2f}%') rf_feature_importance.sort_values().plot(kind = 'bar', figsize = (13,5));",No,2,79.0 "import lightgbm as lgbm params_lgbm = { 'learning_rate': [.01, .1, .5, .7, .9, .95, .99, 1], 'boosting': ['gbdt'], 'metric': ['l1'], 'feature_fraction': [.3, .4, .5, 1], 'num_leaves': [20], 'min_data': [10], 'max_depth': [10], 'n_estimators': [10, 30, 50, 100] } lgb = lgbm.LGBMRegressor() lgb_regressor = GridSearchCV(lgb, params_lgbm, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) lgb_regressor.fit(X_train, y_train) print(f'Optimal lr: {lgb_regressor.best_params_[""learning_rate""]}') print(f'Optimal feature_fraction: {lgb_regressor.best_params_[""feature_fraction""]}') print(f'Optimal n_estimators: {lgb_regressor.best_params_[""n_estimators""]}') print(f'Best score: {lgb_regressor.best_score_}')'",No,5,6.0 "lgb_model = lgbm.LGBMRegressor(learning_rate=lgb_regressor.best_params_[""learning_rate""], boosting='gbdt', metric='l1', feature_fraction=lgb_regressor.best_params_[""feature_fraction""], num_leaves=20, min_data=10, max_depth=10, n_estimators=lgb_regressor.best_params_[""n_estimators""], n_jobs=-1) lgb_model.fit(X_train, y_train) y_train_pred = lgb_model.predict(X_train) y_pred = lgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "# LightGBM Feature Importance lgb_feature_importance = pd.Series(index = X_train.columns, data = np.abs(lgb_model.feature_importances_)) n_features = (lgb_feature_importance>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(lgb_feature_importance))*100:2.2f}%') lgb_feature_importance.sort_values().plot(kind = 'bar', figsize = (13,5));",No,5,79.0 "params_xgb = { 'learning_rate': [.1, .5, .7, .9, .95, .99, 1], 'colsample_bytree': [.3, .4, .5, .6], 
'max_depth': [4], 'alpha': [3], 'subsample': [.5], 'n_estimators': [30, 70, 100, 200] } xgb_model = XGBRegressor() xgb_regressor = GridSearchCV(xgb_model, params_xgb, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) xgb_regressor.fit(X_train, y_train) print(f'Optimal lr: {xgb_regressor.best_params_[""learning_rate""]}') print(f'Optimal colsample_bytree: {xgb_regressor.best_params_[""colsample_bytree""]}') print(f'Optimal n_estimators: {xgb_regressor.best_params_[""n_estimators""]}') print(f'Best score: {xgb_regressor.best_score_}')'",No,5,6.0 "xgb_model = XGBRegressor(learning_rate=xgb_regressor.best_params_[""learning_rate""], colsample_bytree=xgb_regressor.best_params_[""colsample_bytree""], max_depth=4, alpha=3, subsample=.5, n_estimators=xgb_regressor.best_params_[""n_estimators""], n_jobs=-1) xgb_model.fit(X_train, y_train) y_train_pred = xgb_model.predict(X_train) y_pred = xgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,28.0 "# XGB with early stopping xgb_model.fit(X_train, y_train, early_stopping_rounds=4, eval_set=[(X_test, y_test)], verbose=False) y_train_pred = xgb_model.predict(X_train) y_pred = xgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,3,28.0 "# XGB Feature Importance, relevant features can be selected based on its score feature_important = xgb_model.get_booster().get_fscore() keys = list(feature_important.keys()) values = list(feature_important.values()) data = pd.DataFrame(data=values, index=keys, columns=['score']).sort_values(by = 'score', ascending=True) data.plot(kind='bar', figsize = (13,5)) plt.show()",No,3,79.0 "rf_model_en = RandomForestRegressor(max_depth=200, max_features=0.4, min_samples_leaf=3, min_samples_split=6, n_estimators=30, n_jobs=-1, oob_score=True) rf_model_en.fit(X_train, y_train) y_train_pred = rf_model_en.predict(X_train) y_pred = rf_model_en.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,4,28.0 "from numpy import mean from numpy import std from sklearn.datasets import make_regression from sklearn.model_selection import cross_val_score from sklearn.model_selection import RepeatedKFold from sklearn.linear_model import LinearRegression from sklearn.ensemble import StackingRegressor from matplotlib import pyplot # get a stacking ensemble of models def get_stacking(): # define the base models base_models = list() base_models.append(('ridge', ridge_model)) base_models.append(('lasso', lasso_model)) base_models.append(('rf', rf_model_en)) # define meta learner model learner = LinearRegression() # define the stacking ensemble model = StackingRegressor(estimators=base_models, final_estimator=learner, cv=10) return model # get a list of models to evaluate def get_models(): models = dict() 
models['ridge'] = ridge_model models['lasso'] = lasso_model models['rf_en'] = rf_model_en models['stacking'] = get_stacking() return models # evaluate a given model using cross-validation def evaluate_model(model, X, y): cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=19) scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise') return scores # get the models to evaluate models = get_models() # evaluate the models and store results results, names = list(), list() for name, model in models.items(): scores = evaluate_model(model, X_train, y_train) results.append(scores) names.append(name) print(f'{name} {mean(scores):.3f} {std(scores):.3f}') # plot model performance for comparison pyplot.boxplot(results, labels=names, showmeans=True) pyplot.show()",No,2,4.0 "# define the base models base_models = list() base_models.append(('ridge', ridge_model)) base_models.append(('lasso', lasso_model)) base_models.append(('rf', rf_model_en)) # define meta learner model learner = LinearRegression() # define the stacking ensemble stack1 = StackingRegressor(estimators=base_models, final_estimator=learner, cv=10) # fit the model on all available data stack1.fit(X, y)",No,5,82.0 "pivot=pd.pivot_table(train_df,columns='Country_Region',index='Date',values='ConfirmedCases',aggfunc=np.sum) pivot_fatality=pd.pivot_table(train_df,columns='Country_Region',index='Date',values='Fatalities',aggfunc=np.sum) country_list=[] value_list=[] fatality_list=[] for country in list(pivot.columns): country_list.append(country) value_list.append(pivot[country].max()) fatality_list.append(pivot_fatality[country].max()) new_dict={'Country':country_list,'Confirmed':value_list,'Fatality':fatality_list} df=pd.DataFrame.from_dict(new_dict) df.set_index('Country',inplace=True) plt.figure(figsize=(12,8)) plt.subplot(2,1,1) df['Confirmed'].sort_values(ascending=False)[:10].plot(kind='bar',color='blue') plt.title('Top 10 Countries by Confirmed Cases') plt.subplot(2,1,2) df['Fatality'].sort_values(ascending=False)[:10].plot(kind='bar',color='red') plt.title('Top 10 Countries with Fatalities due to Covid-19') plt.tight_layout()",No,5,33.0 "top_confirmed=df.sort_values(by='Confirmed',ascending=False)[:10]",No,5,9.0 "times_series_cntr = train_df.groupby(['Date','Country_Region'])['ConfirmedCases'].sum()\\ .reset_index().set_index('Date') df_countries_tm = times_series_cntr[times_series_cntr['Country_Region'].isin(list_countries)] plt.figure(figsize=(16,12)) ax = sns.lineplot(x=df_countries_tm.index, y=""ConfirmedCases"", hue=""Country_Region"", data=df_countries_tm,palette='muted').set_title('Cumulative line') plt.legend(loc=2, prop={'size': 12}) plt.title('Cumulative trend plot for Confirmed Cases') plt.xticks(rotation=90);'",No,5,81.0 "from sklearn import linear_model import numpy from sklearn.ensemble import RandomForestRegressor cls = RandomForestRegressor(n_estimators=100) cls.fit(X_train, Y_train) pred = cls.predict(X_test) pred = numpy.exp(pred) cls.score(X_train, Y_train)",No,2,7.0 "output = pd.DataFrame({'Id': test['Id'], 'Prediction': pred}) output.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os #for dirname, _, filenames in os.walk('/kaggle/input'): #for filename in filenames: #print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,22.0 "#dependencies import pandas as pd import keras import tensorflow as tf import matplotlib.pyplot as plt from matplotlib import style from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn import preprocessing, svm from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten from keras.layers import Embedding import math from keras import metrics import seaborn as sns from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from keras.models import Sequential from keras.layers import Dense, LSTM import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') import datetime as dt from sklearn.ensemble import RandomForestRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.model_selection import TimeSeriesSplit from sklearn.svm import SVR from xgboost import XGBRegressor",No,5,23.0 "df_train = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv', index_col=0)",No,5,45.0 "#df_train['Fatalities'].plt.show() df_train.drop(columns=['Province_State'], inplace=True)",No,4,10.0 "df_train.fillna(0, inplace=True) #df_train.set_index('Date', inplace=True)",No,5,17.0 "testData = pd.read_csv(""../input/test.csv"") submission = pd.DataFrame({ ""Id"": testData[""Id""], ""Prediction"": pred }) submission.to_csv('RandomForestSimple.csv',header=True, index=False)'",No,4,45.0 "le = preprocessing.LabelEncoder() df_train['Country_Region'] = le.fit_transform(df_train['Country_Region']) df_train['Date'] = le.fit_transform(df_train['Date']) df_train",No,5,20.0 "X = df_train.drop(columns=['Fatalities','ConfirmedCases']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(X) #t_scaled_data = preprocessing.scale(X) X= np.array(X) X = preprocessing.scale(X)",No,3,10.0 "scaler.scale_ scale=1/ 1.51515152e-02",No,5,53.0 "# Import necessary libraries import datetime import numpy as np import pandas as pd import matplotlib as plt import warnings warnings.filterwarnings('ignore') # Import ML libraries from sklearn import preprocessing from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from xgboost import XGBClassifier, XGBRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import GridSearchCV from sklearn.metrics import mean_absolute_error from sklearn.metrics import accuracy_score",No,5,23.0 "y = df_train.drop(columns=['Date','Country_Region','Fatalities']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(y) y = np.array(y) y = preprocessing.scale(y)",No,2,10.0 "# Load and read files submission_example = 
pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") train_df = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') test_df = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') # Rename columns train_df.rename(columns={'Country_Region': 'Country'}, inplace=True) train_df.rename(columns={'Province_State': 'State'}, inplace=True) test_df.rename(columns={'Country_Region': 'Country'}, inplace=True) test_df.rename(columns={'Province_State': 'State'}, inplace=True) display(train_df.head(5)) display(test_df.head(5)) train_df.info() print('\n') test_df.info()'",No,3,45.0 "# Transform the normal date to pandas datetime train_df['Date'] = pd.to_datetime(train_df['Date']) test_df['Date'] = pd.to_datetime(test_df['Date']) display(train_df.head(5)) display(test_df.head(5))",No,4,16.0 19698-15523,No,5,53.0 X.shape,No,5,58.0 "X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)",No,5,13.0 X_train.shape[1],No,5,58.0 "#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) #X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) #X_train.shape",No,1,53.0 model = XGBRegressor(n_estimators=1000),No,5,4.0 "model.fit(X_train, y_train) #,batch_size = 50, epochs= 20)",No,5,7.0 "# Use the model's predict method on the test data prediction_s = model.predict(X_test) # Calculate the absolute errors errors_s = abs(prediction_s - y_test) # Print out the mean absolute error (mae) print('Mean Absolute Error:', round(np.mean(errors_s), 2))",No,4,27.0 "accuracy = model.score(X_test, y_test) # R^2 score of the regressor on the test split",No,5,49.0 "# Shape of training data print(train_df.shape) # Number of missing values in each column of training data missing_train_count_col = (train_df.isnull().sum()) print(missing_train_count_col[missing_train_count_col>0]) # Shape of testing data print(test_df.shape) # Number of missing values in each column of testing data missing_test_count_col = (test_df.isnull().sum()) print(missing_test_count_col[missing_test_count_col>0])",No,4,39.0 "# define the base models base_model = list() base_model.append(('rf1', rf_model)) base_model.append(('rf2', rf_model_en)) base_model.append(('rf3', RandomForestRegressor(max_depth=8, max_features=0.1, min_samples_leaf=3, min_samples_split=2, n_estimators=250, n_jobs=-1, oob_score=False))) # define meta learner model learner = LinearRegression() # define the stacking ensemble stack2 = StackingRegressor(estimators=base_model, final_estimator=learner, cv=10) # fit the model on all available data stack2.fit(X, y)",No,4,7.0 "df_t = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv',index_col=0)",No,5,45.0 "df_t.drop(columns=['Province_State'], inplace=True)",No,5,10.0 df_t,No,5,41.0 "df_t['Country_Region'] = le.fit_transform(df_t['Country_Region']) df_t['Date'] = le.fit_transform(df_t['Date'])",No,5,20.0 "submission = pd.DataFrame(columns=['Id','Prediction']) submission['Id'] = test_df['Id'] ridge_pred = ridge_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(ridge_pred) submission.to_csv('submission_ridge.csv',index=False) lasso_pred = lasso_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(lasso_pred) submission.to_csv('submission_lasso.csv',index=False) elastic_pred = el_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(elastic_pred) submission.to_csv('submission_elastic.csv',index=False) knn_pred = knn_model.predict(test_df.drop('Id', axis=1)) 
submission['Prediction'] = np.expm1(knn_pred) submission.to_csv('submission_knn.csv',index=False) rf_pred = rf_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(rf_pred) submission.to_csv('submission_rf.csv',index=False) lgb_pred = lgb_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(lgb_pred) submission.to_csv('submission_lgb.csv',index=False) xgb_pred = xgb_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(xgb_pred) submission.to_csv('submission_xgb.csv',index=False) stack_pred1 = stack1.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(stack_pred1) submission.to_csv('submission_stack1.csv',index=False) stack_pred2 = stack2.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(stack_pred2) submission.to_csv('submission_stack2.csv',index=False)",Yes,4,25.0 "#df_t = np.array(scaled_data) #t_scaled_data = scaler.fit_transform(df_t) #t_scaled_data = preprocessing.scale(df_t)",No,1,53.0 "#Libraries to import import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import datetime as dt import pycountry import plotly_express as px sns.set_style('darkgrid') %matplotlib inline import warnings warnings.filterwarnings('ignore') from sklearn.preprocessing import OrdinalEncoder from sklearn import metrics import xgboost as xgb from xgboost import XGBRegressor from xgboost import plot_importance, plot_tree",No,4,22.0 "df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv')",No,5,45.0 "test_predictions = model.predict(subt_data) test_predictions.shape",No,4,27.0 "#test_predictions = test_predictions.reshape(12642,) #test_predictions = test_predictions.reshape(-1, 3)",No,1,53.0 #test_predictions = scaler.inverse_transform(test_predictions),No,1,53.0 "display(df_train.head()) display(df_train.describe()) display(df_train.info())",No,3,40.0 "#INVERSE TRANSFORM #test_predictions = test_predictions.reshape(12642,) #test_predictions_c = test_predictions* scale",No,1,53.0 "df_train['Date'] = pd.to_datetime(df_train['Date'], format = '%Y-%m-%d') df_test['Date'] = pd.to_datetime(df_test['Date'], format = '%Y-%m-%d')",No,5,16.0 "#test_predictions = test_predictions.reshape(12642,) test_predictions = test_predictions_c",No,4,12.0 "print('Minimum date from training set: {}'.format(df_train['Date'].min())) print('Maximum date from training set: {}'.format(df_train['Date'].max()))",No,5,40.0 "b""# Fill null values\ntrain_df['State'].fillna('No State', inplace=True)\ntest_df['State'].fillna('No State', inplace=True)\n\n# Number of missing values in each column of training data\nmissing_train_count_col = (train_df.isnull().sum())\nprint(missing_train_count_col[missing_train_count_col>0])\n\n# Number of missing values in each column of training data\nmissing_test_count_col = (test_df.isnull().sum())\nprint(missing_test_count_col[missing_test_count_col>0])\nprint('\\n')\n\n# Double check no remaining missing values\ntrain_df.info()\nprint('\\n')\ntest_df.info()""",No,3,17.0 "b""# Apply Label Encoding to train and test data\ntrain_df_encoded = train_df.copy()\ntest_df_encoded = test_df.copy()\n\n# Initialize Label encoder\nle = LabelEncoder()\n\n# Create date time features\ndef create_time_features(df):\n df['date'] = df['Date']\n df['hour'] = df['date'].dt.hour\n df['dayofweek'] = df['date'].dt.dayofweek\n df['quarter'] = df['date'].dt.quarter\n 
df['month'] = df['date'].dt.month\n df['year'] = df['date'].dt.year\n df['dayofyear'] = df['date'].dt.dayofyear\n df['dayofmonth'] = df['date'].dt.day\n df['weekofyear'] = df['date'].dt.weekofyear\n \n return df\n\ntrain_df_encoded = create_time_features(train_df_encoded)\ntest_df_encoded = create_time_features(test_df_encoded)\ntrain_df_encoded.State = le.fit_transform(train_df_encoded.State)\ntrain_df_encoded.Country = le.fit_transform(train_df_encoded.Country)\ntest_df_encoded.State = le.fit_transform(test_df_encoded.State)\ntest_df_encoded.Country = le.fit_transform(test_df_encoded.Country)\n\ndisplay(train_df_encoded.tail())\nprint('\\n')\ndisplay(test_df_encoded.tail())""",No,4,8.0 "df_sub = pd.read_csv('../input/covid19-global-forecasting-week-2/submission.csv') df_sub.drop(columns=['Fatalities','ConfirmedCases'], inplace=True) save_file_c = pd.DataFrame(test_predictions_c, columns=[['ConfirmedCases']]) result_c = pd.merge(df_sub, save_file_c,left_index=True, right_index=True) result_c.columns = ['ForecastId','ConfirmedCases']",No,4,12.0 "print('Minimum date from test set: {}'.format(df_test['Date'].min())) print('Maximum date from test set: {}'.format(df_test['Date'].max()))",No,5,40.0 "result_c = pd.merge(df_t,result_c ,on='ForecastId') df_t.drop(columns=['Country_Region'], inplace=True) #result_c.drop(columns=['Country_Region_y','Country_Region_x'], inplace=True",No,4,10.0 "# Specify all features for prediction x_features_drop = ['ConfirmedCases', 'Fatalities', 'Date', 'date'] y_target1 = ['ConfirmedCases'] y_target2 = ['Fatalities'] # Assign features into X, y1, y2 for training and testing X = train_df_encoded.drop(x_features_drop, axis=1) y1 = train_df_encoded[y_target1] y2 = train_df_encoded[y_target2] display(X.head()) display(y1.tail()) display(y2.tail())",No,4,10.0 "df_map = df_train.copy() df_map['Date'] = df_map['Date'].astype(str) df_map = df_map.groupby(['Date','Country_Region'], as_index=False)['ConfirmedCases','Fatalities'].sum()",No,3,60.0 "def get_iso3_util(country_name): try: country = pycountry.countries.get(name=country_name) return country.alpha_3 except: if 'Congo' in country_name: country_name = 'Congo' elif country_name == 'Diamond Princess' or country_name == 'Laos': return country_name elif country_name == 'Korea, South': country_name = 'Korea, Republic of' elif country_name == 'Taiwan*': country_name = 'Taiwan' country = pycountry.countries.search_fuzzy(country_name) return country[0].alpha_3 d = {} def get_iso3(country): if country in d: return d[country] else: d[country] = get_iso3_util(country) df_map['iso_alpha'] = df_map.apply(lambda x: get_iso3(x['Country_Region']), axis=1)",No,5,8.0 "df_map['ln(ConfirmedCases)'] = np.log(df_map.ConfirmedCases + 1) df_map['ln(Fatalities)'] = np.log(df_map.Fatalities + 1)",No,5,8.0 "b""# # Split into validaion and training data on 2 features\nrft1_train_X, rft1_val_X, rft1_train_y, rft1_val_y = train_test_split(X, y1, train_size=0.8, test_size=0.2, random_state=1)\nrft2_train_X, rft2_val_X, rft2_train_y, rft2_val_y = train_test_split(X, y2, train_size=0.8, test_size=0.2, random_state=2)\n\n# Define the models\nmodel_1 = DecisionTreeClassifier(splitter='best', max_features='log2', random_state=42)\nmodel_2 = DecisionTreeClassifier(splitter='random', max_features='log2', random_state=42)\nmodel_3 = DecisionTreeClassifier(splitter='best', max_features='sqrt', random_state=42)\nmodel_4 = DecisionTreeClassifier(splitter='random', max_features='sqrt', random_state=42)\nmodel_5 = DecisionTreeClassifier(splitter='random', 
max_features='log2', random_state=42)\nmodel_6 = DecisionTreeClassifier(splitter='random', max_features='sqrt', random_state=42)\nmodel_7 = DecisionTreeClassifier(splitter='best', max_features='log2', random_state=42)\nmodel_8 = DecisionTreeClassifier(splitter='best', max_features='sqrt', random_state=42)\n\nrf_models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8]\n\n# Function for comparing different models\ndef score_model(model, train_X, val_X, train_y, val_y):\n model.fit(train_X, train_y)\n preds = model.predict(val_X)\n #accuracy = accuracy_score(y_v, preds)\n return mean_absolute_error(val_y, preds)\n\n# Evaluate the models for y1:\nfor i in range(0, len(rf_models)):\n mae = score_model(rf_models[i], rft1_train_X, rft1_val_X, rft1_train_y, rft1_val_y)\n print('Model %d MAE y1: %d' % (i+1, mae))\n\nprint('\\n')\n \n# Evaluate the models for y2:\nfor i in range(0, len(rf_models)):\n mae = score_model(rf_models[i], rft2_train_X, rft2_val_X, rft2_train_y, rft2_val_y)\n print('Model %d MAE y2: %d' % (i+1, mae))""",No,5,3.0 "result_c.set_index('ForecastId', inplace=True)",No,5,84.0 "result_c['ConfirmedCases'] = [0 if result_c.loc[i, 'ConfirmedCases'] <= -0 else result_c.loc[i, 'ConfirmedCases'] for i in result_c.index]",No,5,8.0 "#Fatalities X X = df_train.drop(columns=['Fatalities', ]) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(X) #t_scaled_data = preprocessing.scale(X) X= np.array(X) X = preprocessing.scale(X)",No,3,10.0 "#Fatalities y y = df_train.drop(columns=['Date','Country_Region','ConfirmedCases']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(y) y = np.array(y) y = preprocessing.scale(y)",No,4,10.0 "# Choose best Random Forest Model for y1 and y2 best_rf_model_y1 = model_2 best_rf_model_y2 = model_2 # Assign features to test data x_test_features_drop = ['Date', 'date'] X_test = test_df_encoded.drop(x_test_features_drop, axis=1) # Predict the best model for y1 and y2 y1_pred = best_rf_model_y1.predict(X_test) y2_pred = best_rf_model_y2.predict(X_test) print(y1_pred[100:150]) print(y2_pred[100:150])",No,4,48.0 "# Save predictions in format used for competition scoring output = pd.DataFrame({'ForecastId': test_df.ForecastId, 'ConfirmedCases': rnd_y1_pred, 'Fatalities': rnd_y2_pred}) output.to_csv('submission.csv', index=False) print(output.tail(10)) print('Submission file successfully saved..')",No,5,25.0 "#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) #X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) X_train.shape",No,5,58.0 "px.choropleth(df_map, locations=""iso_alpha"", color=""ln(ConfirmedCases)"", hover_name=""Country_Region"", hover_data=[""ConfirmedCases""] , animation_frame=""Date"", color_continuous_scale=px.colors.sequential.dense, title='Daily Confirmed Cases growth(Logarithmic Scale)')'",No,5,33.0 model1 = XGBRegressor(n_estimators=1000),No,5,4.0 "px.choropleth(df_map, locations=""iso_alpha"", color=""ln(Fatalities)"", hover_name=""Country_Region"", hover_data=[""Fatalities""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.OrRd, title = 'Daily Deaths growth(Logarithmic Scale)')'",No,4,33.0 "#Compile the model, because this is a binary classification problem, accuracy can be used #model1.compile(optimizer='Adam', loss= 'mean_squared_error')",No,1,53.0 "training = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") testing = 
pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") data_ = pd.read_csv(""/kaggle/input/covid19-demographic-predictors/covid19_by_country.csv"")",No,5,45.0 "model1.fit(X_train, y_train)#, batch_size = 50, epochs= 20)",No,5,7.0 "train_c = result_c train_c = scaler.fit_transform(train_c) train_c= np.array(train_c) train_c = preprocessing.scale(train_c) #train_c = np.reshape(train_c, (train_c.shape[0], train_c.shape[1], 1))",No,4,18.0 "pred_f = model1.predict(train_c) #pred_f = pred_f.reshape(-1,3)",No,5,27.0 "save_file_f = pd.DataFrame(pred_f, columns=['Fatalities']) save_file_f.index += 1 ",No,5,55.0 "#Get the top 10 countries last_date = df_train.Date.max() df_countries = df_train[df_train['Date']==last_date] df_countries = df_countries.groupby('Country_Region', as_index=False)['ConfirmedCases','Fatalities'].sum() df_countries = df_countries.nlargest(10,'ConfirmedCases') #Get the trend for top 10 countries df_trend = df_train.groupby(['Date','Country_Region'], as_index=False)['ConfirmedCases','Fatalities'].sum() df_trend = df_trend.merge(df_countries, on='Country_Region') df_trend.drop(['ConfirmedCases_y','Fatalities_y'],axis=1, inplace=True) df_trend.rename(columns={'Country_Region':'Country', 'ConfirmedCases_x':'Cases', 'Fatalities_x':'Deaths'}, inplace=True) #Add columns for studying logarithmic trends df_trend['ln(Cases)'] = np.log(df_trend['Cases']+1)# Added 1 to remove error due to log(0). df_trend['ln(Deaths)'] = np.log(df_trend['Deaths']+1)",No,4,14.0 "#change the name of the 'country' feature to match 'Country_Region' on the train set data_['Country_Region']= data_.Country data_.drop('Country',axis=1, inplace =True)",No,4,10.0 "px.line(df_trend, x='Date', y='Cases', color='Country', title='COVID19 Cases growth for top 10 worst affected countries')",No,5,75.0 "result = pd.merge(result_c, save_file_f, left_index=True, right_index=True)",No,5,32.0 training.info(),No,5,40.0 "px.line(df_trend, x='Date', y='Deaths', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries')",No,5,75.0 "print(data_.shape) print(training.shape)",No,5,58.0 result,No,5,41.0 "px.line(df_trend, x='Date', y='ln(Cases)', color='Country', title='COVID19 Cases growth for top 10 worst affected countries(Logarithmic Scale)')",No,5,75.0 "#missing values training.isnull().sum()",No,5,39.0 "px.line(df_trend, x='Date', y='ln(Deaths)', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries(Logarithmic Scale)')",No,5,75.0 submission = result,No,5,77.0 "#missing values data_.isnull().sum()",No,5,39.0 "data_['Quarantine_date'] = pd.to_datetime(data_.Quarantine) data_['Restrictions_date'] = pd.to_datetime(data_.Restrictions) data_['Schools_date'] = pd.to_datetime(data_.Schools) data_.drop(['Schools', 'Restrictions', 'Quarantine'], axis =1, inplace = True)",No,4,16.0 "submission['Fatalities'] = [0 if submission.loc[i, 'Fatalities'] < 0 else submission.loc[i, 'Fatalities'] for i in submission.index]",No,5,8.0 training.Date = pd.to_datetime(training.Date),No,5,16.0 submission,No,5,41.0 "training = training.fillna({'Province_State': 'Unknown'}) testing = testing.fillna({'Province_State': 'Unknown'})",No,5,17.0 "df_us = df_train[df_train['Country_Region']=='US'] df_us['Date'] = df_us['Date'].astype(str) df_us['state_code'] = df_us.apply(lambda x: us_state_abbrev.get(x.Province_State,float('nan')), axis=1) df_us['ln(ConfirmedCases)'] = np.log(df_us.ConfirmedCases + 1) df_us['ln(Fatalities)'] = np.log(df_us.Fatalities + 1)",No,4,8.0 " 
submission.drop(columns=['Country_Region','Date'], inplace=True)",No,5,10.0 data_.info(),No,5,40.0 "px.choropleth(df_us, locationmode=""USA-states"", scope=""usa"", locations=""state_code"", color=""ln(ConfirmedCases)"", hover_name=""Province_State"", hover_data=[""ConfirmedCases""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.Darkmint, title = 'Daily Cases growth for USA(Logarithmic Scale)')'",No,5,33.0 "px.choropleth(df_us, locationmode=""USA-states"", scope=""usa"", locations=""state_code"", color=""ln(Fatalities)"", hover_name=""Province_State"", hover_data=[""Fatalities""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.OrRd, title = 'Daily deaths growth for USA(Logarithmic Scale)')'",No,5,33.0 submission['Fatalities'].sum(),No,5,40.0 len(submission),No,5,58.0 submission.to_csv('submission.csv'),No,5,25.0 "df_train.Province_State.fillna('NaN', inplace=True)",No,5,17.0 "df_plot = df_train.groupby(['Date','Country_Region','Province_State'], as_index=False)['ConfirmedCases','Fatalities'].sum()",No,5,60.0 "import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import datetime import numpy as np import random import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.data as tud import torch.optim as optim # set cuda USE_CUDA = torch.cuda.is_available() DEVICE = torch.device('cuda' if USE_CUDA else 'cpu') #set random seed RANDOM_SEED = 10015 random.seed(RANDOM_SEED) np.random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) if USE_CUDA: torch.cuda.manual_seed(RANDOM_SEED) # set hyper parameters # LSTM NUM_HIDDEN = 256 N_STEP = 66 # data TIME_STEP = 67 NUM_FEATURES = 2 # train BATCH_SIZE = TIME_STEP // N_STEP EPOCHS = 2000 LEARNING_RATE = 0.01 CLIP_VALUE = 1. 
# predict N_PREFIX = 10 N_PREDICT = 33",No,5,77.0 "df = training.groupby(['Country_Region', 'Date'], as_index=False).sum() df_test = testing.groupby(['Country_Region', 'Date'], as_index=False).sum()",No,5,60.0 df_test[df_test.Country_Region == 'Italy'],No,5,14.0 df[df.Country_Region == 'Italy'],No,5,14.0 len(df.Country_Region.unique()),No,5,54.0 len(df_test.Country_Region.unique()),No,5,54.0 "train = pd.merge(training, data_, on=['Country_Region'], how= 'left') test = pd.merge(testing, data_, on=['Country_Region'], how= 'left')",No,5,32.0 "df = df_plot.query(""Country_Region=='India'"") px.line(df, x='Date', y='ConfirmedCases', title='Daily Cases growth for India')'",No,5,33.0 train.isna().sum(),No,5,39.0 "px.line(df, x='Date', y='Fatalities', title='Daily Deaths growth for India')",No,5,75.0 "data_[data_.Restrictions_date.notnull()][['Country_Region', 'Quarantine_date']]",No,5,14.0 train.loc[(train['Date'] == '2020-03-20') &(train.Country_Region == 'Argentina') ],No,5,14.0 "import pandas as pd import numpy as np",No,5,22.0 "ch_geojson = ""../input/china-regions-map/china-provinces.json"" df_plot['day'] = df_plot.Date.dt.dayofyear df_plot['Province_ch'] = """"'",No,3,8.0 "PATH_WEEK='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK}/test.csv') df_hospital_beds = pd.read_csv(r""/kaggle/input/hospital-beds/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv"", skiprows=4) df_population2 = pd.read_csv(""/kaggle/input/populationdata/population_by_country_2020.csv"", na_values=""N.A."") df_environment_pm2 = pd.read_csv(""/kaggle/input/environmentpm25/API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_888986.csv"", skiprows=4) df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_hospital_beds.rename(columns={'Country Name' : 'Country'}, inplace=True) df_environment_pm2.rename(columns={'Country Name' : 'Country'}, inplace=True) df_population2.set_axis([""Country"", ""Population"", ""YearlyChange"", ""NetChange"", ""Density"", ""LandArea"", ""Migrants"", ""FertilityRate"", ""MedAge"", ""UrbanPop"", ""WorldShare""], axis=1, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_train.info()'",No,1,45.0 "test['Quarantine'] = 0 test['Schools'] = 0 test['Restrictions'] = 0 test.loc[(test.Country_Region == 'Argentina'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Austria'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Belgium'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'China'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Colombia'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Denmark'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'France'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Germany'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'India'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Israel'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Italy'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Malaysia'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'New Zealand'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Peru'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Spain'), 'Quarantine' ] = 1 
test.loc[(test.Country_Region == 'Israel'), 'Schools' ] = 1 test.loc[(test.Country_Region == 'Israel'), 'Restrictions' ] = 1 test.drop(['Quarantine_date', 'Schools_date', 'Restrictions_date'], axis = 1, inplace = True)",No,4,8.0 "df = df_plot.query(""Country_Region=='China'"") fig = px.choropleth_mapbox(df, geojson=ch_geojson, #scope=""asia"", color=""ConfirmedCases"", locations=""Province_ch"", featureidkey=""objects.CHN_adm1.geometries.properties.NL_NAME_1"", #featureidkey=""features.properties.name"", animation_frame=""day"") fig.update_geos(fitbounds=""locations"", visible=False) fig.update_layout(margin={""r"":0,""t"":0,""l"":0,""b"":0}) fig.show()'",No,5,33.0 "train['Quarantine'] = 0 train['Schools'] = 0 train['Restrictions'] = 0 train.loc[(train['Date'] >= '2020-03-20') &(train.Country_Region == 'Argentina'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-16') &(train.Country_Region == 'Austria'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-18') &(train.Country_Region == 'Belgium'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-01-24') &(train.Country_Region == 'China'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-25') &(train.Country_Region == 'Colombia'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-16') &(train.Country_Region == 'Denmark'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-17') &(train.Country_Region == 'France'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-21') &(train.Country_Region == 'Germany'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-23') &(train.Country_Region == 'India'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-08') &(train.Country_Region == 'Italy'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-18') &(train.Country_Region == 'Malaysia'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-23') &(train.Country_Region == 'New Zealand'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-15') &(train.Country_Region == 'Peru'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-15') &(train.Country_Region == 'Spain'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Schools' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Restrictions' ] = 1 train.drop(['Quarantine_date', 'Schools_date', 'Restrictions_date'], axis = 1, inplace = True)",No,5,8.0 "train[train.Quarantine == 1][['Country_Region', 'Date']].head(50)",No,5,41.0 "df = df_plot.query(""Country_Region=='China'"") px.line(df, x='Date', y='ConfirmedCases', color='Province_State', title='Daily Cases growth for China')'",No,5,33.0 "country_names = {'Bahamas, The': 'Bahamas', 'Brunei Darussalam' : 'Brunei', 'DR Congo' : 'Congo (Kinshasa)', ""Cte d'Ivoire"" : ""Cote d'Ivoire"", 'Congo' : 'Congo (Brazzaville)', 'Congo, Rep.': 'Congo (Brazzaville)', 'Congo, Dem. Rep.': 'Congo (Kinshasa)', 'Czech Republic (Czechia)' : 'Czechia', 'Czech Republic': 'Czechia', 'Diamond Princess': 'Diamond Princess', 'Egypt, Arab Rep.': 'Egypt', 'Gambia, The': 'Gambia', 'Holy See': 'Holy See', 'Iran, Islamic Rep.': 'Iran', 'Korea, Rep.': 'Korea, South', 'South Korea':'Korea, South', 'Kyrgyz Republic': 'Kyrgyzstan', 'Lao PDR': 'Laos', 'Russian Federation': 'Russia', 'St. Kitts and Nevis': 'Saint Kitts and Nevis', 'Saint Kitts & Nevis' : 'Saint Kitts and Nevis', 'St. Lucia': 'Saint Lucia', 'St. 
Vincent and the Grenadines': 'Saint Vincent and the Grenadines', 'St. Vincent & Grenadines':'Saint Vincent and the Grenadines', 'Serbia': 'Serbia', 'Slovak Republic': 'Slovakia', 'Syrian Arab Republic': 'Syria', 'Taiwan': 'Taiwan*', 'United States': 'US', 'Venezuela, RB': 'Venezuela' } df_population2.Country.replace(country_names, inplace=True) df_hospital_beds.Country.replace(country_names, inplace=True) df_environment_pm2.Country.replace(country_names, inplace=True) df_train = pd.merge(df_train, df_population2, on=""Country"", how=""left"") df_test = pd.merge(df_test, df_population2, on=""Country"", how=""left"") df_hospital_beds.rename(columns={'2011':'HospitalBeds'}, inplace=True) df_train = pd.merge(df_train, df_hospital_beds[[""Country"", ""HospitalBeds""]], on=""Country"", how=""left"") df_test = pd.merge(df_test, df_hospital_beds[[""Country"", ""HospitalBeds""]], on=""Country"", how=""left"") df_environment_pm2.rename(columns={'2017':'PM25'}, inplace=True) df_train = pd.merge(df_train, df_environment_pm2[[""Country"", ""PM25""]], on=""Country"", how=""left"") df_test = pd.merge(df_test, df_environment_pm2[[""Country"", ""PM25""]], on=""Country"", how=""left"") '",Yes,1,8.0 "px.line(df, x='Date', y='Fatalities', color='Province_State', title='Daily Deaths growth for China')",No,5,75.0 data_[data_.Quarantine_date.notnull()],No,5,14.0 "df_train['NumDate'] = df_train.Date.astype(int)/((10**9)*60*60*24) first_date = df_train.NumDate.min() df_train.NumDate -= first_date df_train.head() df_test['NumDate'] = df_test.Date.astype(int)/((10**9)*60*60*24) df_test.NumDate -= first_date outbreak_dates = df_train[['Country', 'NumDate']][df_train.ConfirmedCases>0].groupby('Country', as_index=False).min() outbreak_dates.columns = ['Country', 'FirstOutbreak'] first_death = df_train[['Country', 'NumDate']][df_train.Fatalities>0].groupby('Country', as_index=False).min() first_death.columns = ['Country', 'FirstDeath'] df_train = pd.merge(df_train, outbreak_dates, how='left') df_test = pd.merge(df_test, outbreak_dates, how='left') df_train = pd.merge(df_train, first_death, how='left') df_train.FirstDeath.fillna(0,inplace=True) df_test = pd.merge(df_test, first_death, how='left') df_test.FirstDeath.fillna(0,inplace=True) df_train['DaysSinceOutbreak'] = df_train.NumDate - df_train.FirstOutbreak df_test['DaysSinceOutbreak'] = df_test.NumDate - df_test.FirstOutbreak df_train['DaysSinceFirstDeath'] = df_train.NumDate - df_train.FirstDeath df_test['DaysSinceFirstDeath'] = df_test.NumDate - df_test.FirstDeath df_train.head()",Yes,1,8.0 "def categoricalToInteger(df): #convert NaN Province State values to a string df.Province_State.fillna('NaN', inplace=True) #Define Ordinal Encoder Model oe = OrdinalEncoder() df[['Province_State','Country_Region']] = oe.fit_transform(df.iloc[:,1:3]) return df",No,5,20.0 "df_train.WorldShare = df_train.WorldShare.str.rstrip('%').astype('float') / 100.0 df_test.WorldShare = df_test.WorldShare.str.rstrip('%').astype('float') / 100.0 df_train.UrbanPop = df_train.UrbanPop.str.rstrip('%').astype('float') / 100.0 df_test.UrbanPop = df_test.UrbanPop.str.rstrip('%').astype('float') / 100.0 df_train.YearlyChange = df_train.YearlyChange.str.rstrip('%').astype('float') / 100.0 df_test.YearlyChange = df_test.YearlyChange.str.rstrip('%').astype('float') / 100.0",No,5,16.0 "def create_features(df): df['day'] = df['Date'].dt.day df['month'] = df['Date'].dt.month df['dayofweek'] = df['Date'].dt.dayofweek df['dayofyear'] = df['Date'].dt.dayofyear df['quarter'] = df['Date'].dt.quarter 
df['weekofyear'] = df['Date'].dt.weekofyear return df",No,5,8.0 "from scipy.optimize import curve_fit import matplotlib.pyplot as plt def log_growth(x, a,r,c): return 1.0*c/(1+a * np.exp(-r * x)) def get_log_fit_params(data): try: popt, pcov = curve_fit(log_growth, data.NumDate, data.ConfirmedCases, method=""lm"") return popt except RuntimeError: try: popt, pcov = curve_fit(log_growth, data.NumDate, data.ConfirmedCases, method=""trf"") except RuntimeError: return np.zeros(3) df_popt = df_train.groupby(""Country"").apply(get_log_fit_params) df_opt = df_popt.apply(pd.Series) df_opt.set_axis([""a"", ""r"", ""c""], axis=1, inplace=True) df_opt.head()",No,5,53.0 df_opt[df_opt.a>0],No,5,14.0 "df_train = pd.merge(df_train, df_opt, on=""Country"", how=""left"") df_test = pd.merge(df_test, df_opt, on=""Country"", how=""left"")",No,5,32.0 "df_train['LogPrediction'] = df_train.apply(lambda x : log_growth(x.NumDate, x.a, x.r, x.c), axis=1) df_test['LogPrediction'] = df_test.apply(lambda x : log_growth(x.NumDate, x.a, x.r, x.c), axis=1)",No,5,8.0 train['Quarantine'].any() ==1,No,5,53.0 train[train.Country_Region=='Italy'],No,5,14.0 "def cum_sum(df, date, country, state): sub_df = df[(df['Country_Region']==country) & (df['Province_State']==state) & (df['Date']<=date)] display(sub_df) return sub_df['ConfirmedCases'].sum(), sub_df['Fatalities'].sum()",Yes,3,14.0 "from sklearn.metrics import mean_squared_log_error from sklearn.model_selection import KFold from sklearn.preprocessing import PolynomialFeatures import xgboost as xgb df_train.fillna(0, inplace=True) df_test.fillna(0, inplace=True) features = ['NumDate', ""Population"", ""YearlyChange"", ""NetChange"", ""Density"", ""LandArea"", ""Migrants"", ""FertilityRate"", ""MedAge"", ""UrbanPop"", ""WorldShare"", 'FirstOutbreak', 'DaysSinceOutbreak', 'FirstDeath', 'DaysSinceFirstDeath', 'HospitalBeds', 'PM25', 'LogPrediction', 'a', 'r', 'c'] X = pd.concat([df_train[features], pd.get_dummies(df_train.Country,prefix=""C_""), pd.get_dummies(df_train.State,prefix=""S_"")],axis=1) y1 = df_train.ConfirmedCases y2 = df_train.Fatalities fit1 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y1) fit2 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y2) error1 = np.sqrt(mean_squared_log_error([max(x,0) for x in fit1.predict(X)], y1)) error2 = np.sqrt(mean_squared_log_error([max(x,0) for x in fit2.predict(X)], y2)) print(error1) print(error2) print((error1+error2)/2)'",Yes,1,11.0 "df_out = pd.DataFrame(df_test.ForecastId) X = pd.concat([df_test[features], pd.get_dummies(df_test.Country,prefix=""C_""), pd.get_dummies(df_test.State,prefix=""S_"")],axis=1) df_out['ConfirmedCases'] = [max(x,0) for x in fit1.predict(X)] df_out['Fatalities'] = [max(x,0) for x in fit2.predict(X)] df_out.tail()'",Yes,1,11.0 train[train['Restrictions'] == 1],No,5,14.0 train.columns,No,5,71.0 "def train_dev_split(df): date = df['Date'].max() - dt.timedelta(days=7) return df[df['Date'] <= date], df[df['Date'] > date]",No,5,13.0 "train.hist(figsize=(11,10))",No,5,33.0 "feat_importances = pd.Series(fit1.feature_importances_, index=X.columns) feat_importances.sort_values(ascending=False).head(20)",No,4,9.0 "df_train = categoricalToInteger(df_train) df_train = create_features(df_train)",No,5,8.0 "train.drop(['Tests','Test Pop', 'Density', 'Urban Pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'Sex Ratio', 'lung', 'Female Lung', 'Male Lung', 'Crime Index', 'Population 2020', 'Smoking 2016', 'Females 2018', 'Total Infected','Total Deaths', 'Total 
Recovered', 'Hospital Bed', 'Median Age', 'GDP 2018'], axis = 1, inplace = True)",No,5,10.0 "prediction_ww = pd.merge(df_test, df_out, on=""ForecastId"")[[""Date"", ""ConfirmedCases"", ""Fatalities""]].groupby(""Date"").sum() prediction_ww.set_axis([""PredictedCases"", ""PredictedFatalities""], axis=1, inplace=True) prediction_ww.plot()",No,4,11.0 "df_train, df_dev = train_dev_split(df_train)",No,5,13.0 "test.drop(['Tests','Test Pop', 'Density', 'Urban Pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'Sex Ratio', 'lung', 'Female Lung', 'Male Lung', 'Crime Index', 'Population 2020', 'Smoking 2016', 'Females 2018', 'Total Infected', 'Total Deaths', 'Total Recovered', 'Hospital Bed', 'Median Age', 'GDP 2018'], axis = 1, inplace = True)",No,5,10.0 "train_ww = df_train[[""Date"", ""ConfirmedCases"", ""Fatalities""]].groupby(""Date"").sum() train_ww.plot()",No,4,60.0 "pd.merge(train_ww, prediction_ww, how='outer', on=""Date"").plot()",No,5,32.0 print(train.describe()),No,5,40.0 print(test.describe()),No,5,40.0 test.isna().sum(),No,5,39.0 "columns = ['day','month','dayofweek','dayofyear','quarter','weekofyear','Province_State', 'Country_Region','ConfirmedCases','Fatalities'] df_train = df_train[columns] df_dev = df_dev[columns]",No,5,10.0 test.Date = pd.to_datetime(test.Date),No,5,16.0 "train = df_train.values dev = df_dev.values X_train, y_train = train[:,:-2], train[:,-2:] X_dev, y_dev = dev[:,:-2], dev[:,-2:]",No,4,21.0 "def create_time_features(df): """""" Creates time series features from datetime index """""" df['date'] = df.index df['dayofweek'] = df['Date'].dt.dayofweek df['quarter'] = df['Date'].dt.quarter df['month'] = df['Date'].dt.month df['dayofyear'] = df['Date'].dt.dayofyear df['dayofmonth'] = df['Date'].dt.day df['weekofyear'] = df['Date'].dt.weekofyear X = df[['dayofweek','quarter','month', 'dayofyear','dayofmonth','weekofyear']] return X",No,5,8.0 "import pandas as pd #import plotly.express as px #import matplotlib.pyplot as plt #import plotly.graph_objects as go from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression import numpy as np from sklearn.metrics import mean_squared_log_error from sklearn.tree import DecisionTreeRegressor from google.cloud import bigquery from scipy.spatial.distance import cdist from sklearn.preprocessing import LabelEncoder",No,5,22.0 "train_df=pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test_df=pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")",No,5,45.0 train.Date = pd.to_datetime(train.Date),No,5,16.0 "create_time_features(train) create_time_features(test)",No,5,8.0 "def modelfit(alg, X_train, y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=50): if useTrainCV: xgb_param = alg.get_xgb_params() xgtrain = xgb.DMatrix(X_train, label=y_train) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, metrics='rmse', early_stopping_rounds=early_stopping_rounds, show_stdv=False) alg.set_params(n_estimators=cvresult.shape[0]) #Fit the algorithm on the data alg.fit(X_train, y_train,eval_metric='rmse') #Predict training set: predictions = alg.predict(X_train) #predprob = alg.predict_proba(X_train)[:,1] #Print model report: print(""\nModel Report"") #print(""Accuracy : %.4g"" % metrics.accuracy_score(y_train, predictions)) print(""RMSE Score (Train): %f"" % metrics.mean_squared_error(y_train, predictions)) feat_imp = pd.Series(alg.feature_importances_).sort_values(ascending=False) 
feat_imp.plot(kind='bar', title='Feature Importances') plt.ylabel('Feature Importance Score')",Yes,2,7.0 "train.drop([""Id"",""Date"", 'date'], axis=1, inplace=True) test.drop([""Date"", 'date'], axis=1, inplace=True)",No,5,10.0 "print(""Min train date: "",train_df[""Date""].min()) print(""Max train date: "",train_df[""Date""].max()) print(""Min test date: "",test_df[""Date""].min()) print(""Max test date: "",test_df[""Date""].max())",No,5,40.0 "train_df=train_df[train_df[""Date""]<""2020-03-19""]",No,5,14.0 test_df.isnull().sum(),No,5,39.0 "pop_info = pd.read_csv(""../input/population-by-country-2020/population_by_country_2020.csv"")",No,5,45.0 "pop_info.rename(columns={'Density (P/Km²)': 'Density'}, inplace=True)",No,5,61.0 pop_info.columns,No,5,71.0 "country_lookup=pop_info[[""Country (or dependency)"",""Population (2020)"",""Density""]]",No,5,12.0 "pd.DataFrame.from_dict(country_lookup) train_df_pop=pd.merge(train_df, country_lookup, how='left', left_on='Country_Region', right_on='Country (or dependency)')",No,4,11.0 train_df_pop.info(),No,5,40.0 "train_df_pop.loc[train_df_pop[""Country_Region""]==""US"", [""Population (2020)""]]=331002651 #United States train_df_pop.loc[train_df_pop[""Country_Region""]==""US"", [""Density""]]=36 train_df_pop.loc[train_df_pop[""Country_Region""]==""Korea, South"", [""Population (2020)""]]=51269185 #South Korea train_df_pop.loc[train_df_pop[""Country_Region""]==""Korea, South"", [""Density""]]=527 train_df_pop.loc[train_df_pop[""Country_Region""]==""Czechia"", [""Population (2020)""]]=10708981 #Czech Republic train_df_pop.loc[train_df_pop[""Country_Region""]==""Czechia"", [""Density""]]=139 train_df_pop.loc[train_df_pop[""Country_Region""]==""Taiwan*"", [""Population (2020)""]]=23816775 #Taiwan train_df_pop.loc[train_df_pop[""Country_Region""]==""Taiwan*"", [""Density""]]=673 train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Kinshasa)"", [""Population (2020)""]]=89561403 #DR Congo train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Kinshasa)"", [""Density""]]=40 train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Brazzaville)"", [""Population (2020)""]]=5518087 #Congo train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Brazzaville)"", [""Density""]]=16 train_df_pop.loc[train_df_pop[""Country_Region""]==""Cote d'Ivoire"", [""Population (2020)""]]=26378274 #Côte d'Ivoire train_df_pop.loc[train_df_pop[""Country_Region""]==""Cote d'Ivoire"", [""Density""]]=83 train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Kitts and Nevis"", [""Population (2020)""]]=53199 #Saint Kitts & Nevis train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Kitts and Nevis"", [""Density""]]=205 train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Vincent and the Grenadines"", [""Population (2020)""]]=110940 #St. 
Vincent & Grenadines train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Vincent and the Grenadines"", [""Density""]]=284 train_df_pop.loc[train_df_pop[""Country_Region""]==""Diamond Princess"", [""Population (2020)""]]=3770 #Population and density are same since it is a cruise ship train_df_pop.loc[train_df_pop[""Country_Region""]==""Diamond Princess"", [""Density""]]=3770'",No,5,8.0 "model1 = XGBRegressor(n_estimators=1000) model2 = XGBRegressor(n_estimators=1000)",No,5,4.0 "model1.fit(X_train, y_train[:,0], eval_set=[(X_train, y_train[:,0]), (X_dev, y_dev[:,0])], verbose=False)",No,5,7.0 "model2.fit(X_train, y_train[:,1], eval_set=[(X_train, y_train[:,1]), (X_dev, y_dev[:,1])], verbose=False)",No,5,7.0 plot_importance(model1);,No,3,79.0 plot_importance(model2);,No,3,79.0 "df_train = categoricalToInteger(df_test) df_train = create_features(df_test)",No,3,8.0 "columns = ['day','month','dayofweek','dayofyear','quarter','weekofyear','Province_State', 'Country_Region'] df_test = df_test[columns]",No,5,10.0 "y_pred1 = model1.predict(df_test.values) y_pred2 = model2.predict(df_test.values)",No,5,48.0 df_submit = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv'),No,5,45.0 "df_submit.ConfirmedCases = y_pred1 df_submit.Fatalities = y_pred2",No,5,55.0 "df_submit.to_csv(r'submission.csv', index=False)",No,5,25.0 "# Installing the required libs !pip install -q fastprogress fastai2 fastcore fast_tabnet --upgrade ",No,5,87.0 "from fastai2.basics import * from fastai2.tabular.all import * from fast_tabnet.core import *",No,5,22.0 "import sys sys.path.insert(0, ""../input/covid19-global-forecasting-week-2/"") import warnings warnings.filterwarnings(action='once')'",No,3,23.0 df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') ,No,5,45.0 "df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') df_test.head()",Yes,4,45.0 "df['key'] = df['Country_Region'] + '#' + df['Province_State'].fillna('') df['province_flag'] = np.where(df['Province_State'].isnull(),0,1) df['Province_State'] = df['Province_State'].fillna(df['Country_Region'])",Yes,3,8.0 "df_test['key'] = df_test['Country_Region'] + '#' + df_test['Province_State'].fillna('') df_test['province_flag'] = np.where(df_test['Province_State'].isnull(),0,1) df_test['Province_State'] = df_test['Province_State'].fillna(df_test['Country_Region'])",Yes,5,8.0 df.head(600),No,5,41.0 df_test.head(600),No,5,41.0 "#firstconfirmed = df[(df['ConfirmedCases']>0) & (df['Date']<'2020-03-19')].groupby(['Province_State','Country_Region'])['Date'].min().reset_index() firstconfirmed = df[(df['ConfirmedCases']>0)].groupby(['Province_State','Country_Region'])['Date'].min().reset_index()",No,5,60.0 firstconfirmed.head(),No,5,41.0 firstconfirmed.shape,No,5,58.0 "firstfatality = df[(df['Fatalities']>0)].groupby(['Province_State','Country_Region'])['Date'].min().reset_index() #firstfatality = df[(df['Fatalities']>0) & (df['Date']<'2020-03-19')].groupby(['Province_State','Country_Region'])['Date'].min().reset_index()",No,4,60.0 firstfatality.head(),No,5,41.0 firstfatality.shape,No,5,58.0 "firstconfirmed.columns = ['Province_State','Country_Region','FirstCaseDate'] firstfatality.columns = ['Province_State','Country_Region','FirstFatalityDate']",No,5,61.0 "df = df.merge(firstconfirmed, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left') df = df.merge(firstfatality, 
left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left')",No,5,32.0 df.head(40),No,5,41.0 "df_test = df_test.merge(firstconfirmed, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left') df_test = df_test.merge(firstfatality, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left')",No,5,32.0 df_test.head(40),No,5,41.0 "df['Date']=pd.to_datetime(df['Date'], infer_datetime_format=True) df['FirstCaseDate']=pd.to_datetime(df['FirstCaseDate'], infer_datetime_format=True) df['FirstFatalityDate']=pd.to_datetime(df['FirstFatalityDate'], infer_datetime_format=True) ",No,5,16.0 "df_test['Date']=pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_test['FirstCaseDate']=pd.to_datetime(df_test['FirstCaseDate'], infer_datetime_format=True) df_test['FirstFatalityDate']=pd.to_datetime(df_test['FirstFatalityDate'], infer_datetime_format=True) ",No,5,16.0 "df['days_first_case']=(df['Date']-df['FirstCaseDate']).dt.days df['days_first_fatality']=(df['Date']-df['FirstFatalityDate']).dt.days",No,5,8.0 df['days_first_case'],No,5,41.0 "df_test['days_first_case']=(df_test['Date']-df_test['FirstCaseDate']).dt.days df_test['days_first_fatality']=(df_test['Date']-df_test['FirstFatalityDate']).dt.days",No,5,8.0 df_test['days_first_case'],No,3,41.0 "df['days_first_case']=np.where(df['days_first_case']<0,0,df['days_first_case'].fillna(0)) df['days_first_fatality']=np.where(df['days_first_fatality']<0,0,df['days_first_fatality'].fillna(0))",No,5,17.0 "df_test['days_first_case']=np.where(df_test['days_first_case']<0,0,df_test['days_first_case'].fillna(0)) df_test['days_first_fatality']=np.where(df_test['days_first_fatality']<0,0,df_test['days_first_fatality'].fillna(0))",No,5,17.0 df.tail(),No,5,41.0 df[df['Country_Region']=='Brazil'].tail(),No,3,41.0 df_test[df_test['Country_Region']=='Brazil'].tail(),No,4,41.0 "add_datepart(df,'Date',drop=False)",No,5,8.0 "add_datepart(df_test,'Date',drop=False)",No,5,13.0 "external = pd.read_csv('/kaggle/input/covid19-week2-external-data/external_data.csv',sep=';',decimal=',')",No,5,45.0 external.head(),No,5,41.0 "df = df.merge(external, left_on='key',right_on='key',how='left')",No,5,32.0 "df_test = df_test.merge(external, left_on='key',right_on='key',how='left')",No,5,32.0 list(df),No,5,41.0 "df['Confirmedlast43'] = df['ConfirmedCases'].shift(43) df['Fatalitieslast43'] = df['Fatalities'].shift(43)",No,5,8.0 "df['is_valid'] = np.where(df['Date']<'2020-03-29', False, True)",No,5,8.0 df.groupby('is_valid').size(),No,5,60.0 "df['ConfirmedLog'] = np.log(df['ConfirmedCases']+1) df['FatalitiesLog'] = np.log(df['Fatalities']+1)",No,5,8.0 "cat_vars = ['Province_State','Country_Region','province_flag'] cont_vars = ['Elapsed', 'days_first_case', 'days_first_fatality', 'pop_density', 'population', 'area', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'centroid_x', 'centroid_y', 'wdi_country_population', 'wdi_country_arrivals', 'wdi_arrivals_per_capita', 'wdi_gini', 'wdi_perc_urban_pop', 'wdi_perc_handwashing', 'wdi_uhc_coverage', 'wdi_hospital_beds_p1000', 'wdi_smoke_prevalence', 'wdi_diabetes_prevalence', 'wdi_gdp_per_capita_ppp', 'wdi_perc_death_comm_diseases', 'wdi_perc_death_non_comm_diseases', 'wdi_death_rate_p1000', 'wdi_perc_basic_sanitation', 'wdi_dom_govmt_healt_exped_gdp', 'wdi_dom_govmt_healt_exped_per_cap', 'wdi_perc_females', 'wdi_perc_males', 'wdi_perc_female_20_29', 'wdi_perc_female_30_39', 'wdi_perc_female_40_49', 'wdi_perc_female_50_59', 
'wdi_perc_female_60_69', 'wdi_perc_female_70_79', 'wdi_perc_female_80p', 'wdi_perc_male_20_29', 'wdi_perc_male_30_39', 'wdi_perc_male_40_49', 'wdi_perc_male_50_59', 'wdi_perc_male_60_69', 'wdi_perc_male_70_79', 'wdi_perc_male_80p', 'wdi_pop_denisty', 'wdi_perc_anual_growth_pop','Month','Week','Dayofyear']#,'Confirmedlast43','Fatalitieslast43'] dep_vars = ['ConfirmedLog']#,'Fatalities']",No,5,77.0 "procs = [FillMissing, Categorify, Normalize] splits = ColSplitter('is_valid')(df)",No,2,13.0 splits,No,5,41.0 "to = TabularPandas(df, procs, cat_names=cat_vars, cont_names=cont_vars, y_names=dep_vars, y_block=RegressionBlock(), splits=splits)",No,5,12.0 to,No,5,53.0 cats.shape,No,5,58.0 conts.shape,No,5,58.0 y.shape,No,5,58.0 dls.c,No,5,40.0 learn.load('/kaggle/input/covid19-week2-external-data/best18_2day'),No,5,30.0 "preds, y = learn.get_preds()",No,5,48.0 np.exp(preds)-1,No,5,55.0 np.exp(y)-1,No,5,53.0 raw_test_preds = learn.get_preds(dl=dl),No,5,48.0 raw_test_preds[0],No,4,41.0 raw_test_preds[1],No,3,41.0 preds = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds[:40],No,5,41.0 df['ConfirmedCases'].head(40),No,5,41.0 "preds = pd.DataFrame(preds) preds.columns = ['pred_confirmed']",No,4,12.0 preds.tail(),No,5,41.0 "df = pd.concat([df, preds], axis=1)",No,5,11.0 preds_confirmed_test = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds_confirmed_test[:40],No,5,41.0 learn.load('/kaggle/input/covid19-week2-external-data/fat_best2_2day'),No,5,30.0 preds,No,5,41.0 y,No,5,53.0 preds_confirmed_test = pd.DataFrame(preds_confirmed_test),No,5,12.0 preds_confirmed_test.columns=['ConfirmedCases'],No,5,61.0 preds_confirmed_test.tail(),No,5,41.0 preds_confirmed_test.shape,No,5,58.0 df_test.shape,No,5,58.0 "df_test = pd.concat([df_test, preds_confirmed_test], axis=1)",No,5,11.0 df_test.tail(40),No,5,41.0 df_test['pred_confirmed']=df_test['ConfirmedCases'],No,5,8.0 preds_fatalities_test = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds_fatalities_test,No,5,41.0 preds_fatalities_test = pd.DataFrame(preds_fatalities_test),No,5,12.0 preds_fatalities_test.columns=['Fatalities'],No,5,61.0 preds_fatalities_test.tail(),No,5,41.0 preds_fatalities_test.shape,No,5,58.0 "df_test = pd.concat([df_test, preds_fatalities_test], axis=1)",No,5,11.0 df_test[df_test['Country_Region']=='Brazil'].head(20),No,3,41.0 sub_ex = df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') ,No,5,45.0 sub_ex.head(),No,5,41.0 "sub = df_test[['ForecastId','ConfirmedCases','Fatalities']]",No,5,55.0 sub.head(),No,5,41.0 sub_ex.tail(),No,5,41.0 sub.tail(),No,5,41.0 "import os, gc, pickle, copy, datetime, warnings import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import lightgbm as lgb from sklearn import metrics pd.set_option('display.max_columns', 100) warnings.filterwarnings('ignore')",No,2,22.0 "df_train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") print(df_train.shape) df_train.head()",Yes,4,45.0 "df_test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") print(df_test.shape) df_test.head()",Yes,4,45.0 "# concat train and test df_traintest = pd.concat([df_train, df_test]) print(df_train.shape, df_test.shape, df_traintest.shape)",No,3,11.0 "# process date df_traintest['Date'] = pd.to_datetime(df_traintest['Date']) df_traintest['day'] = df_traintest['Date'].apply(lambda x: x.dayofyear).astype(np.int16) df_traintest.head()",Yes,4,8.0 "# concat Country/Region and Province/State def func(x): try: x_new = x['Country_Region'] + 
""/"" + x['Province_State'] except: x_new = x['Country_Region'] return x_new df_traintest['place_id'] = df_traintest.apply(lambda x: func(x), axis=1) df_traintest.head()'",No,4,8.0 df_traintest[(df_traintest['day']>=day_before_public-3) & (df_traintest['place_id']=='China/Hubei')].head(),No,3,41.0 "# concat lat and long df_latlong = pd.read_csv(""../input/smokingstats/df_Latlong.csv"") df_latlong.head()",No,4,45.0 "# concat Country/Region and Province/State def func(x): try: x_new = x['Country/Region'] + ""/"" + x['Province/State'] except: x_new = x['Country/Region'] return x_new df_latlong['place_id'] = df_latlong.apply(lambda x: func(x), axis=1) df_latlong = df_latlong[df_latlong['place_id'].duplicated()==False] df_latlong.head()'",No,5,8.0 "df_traintest = pd.merge(df_traintest, df_latlong[['place_id', 'Lat', 'Long']], on='place_id', how='left') df_traintest.head()",No,5,32.0 "print(pd.isna(df_traintest['Lat']).sum()) # count Nan df_traintest[pd.isna(df_traintest['Lat'])].head()",No,3,39.0 " data = pd.read_csv( '/kaggle/input/restaurant-revenue-prediction/train.csv.zip') test_data = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') data.head() ",No,4,45.0 " data.describe() ",No,5,40.0 data.dtypes,No,5,70.0 data['City Group'].unique(),No,5,57.0 data['City'].unique(),No,5,57.0 "#Creating a flag for each type of restaurant data['Type_IL'] = np.where(data['Type'] == 'IL', 1, 0) data['Type_FC'] = np.where(data['Type'] == 'FC', 1, 0) data['Type_DT'] = np.where(data['Type'] == 'DT', 1, 0) #Creating a flag for 'Big Cities' data['Big_Cities'] = np.where(data['City Group'] == 'Big Cities', 1, 0) #Converting Open_Date into day count #Considering the same date the dataset was made available data['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(data['Open Date'])).dt.days #Removing unused columns data = data.drop('Type', axis=1) data = data.drop('City Group', axis=1) data = data.drop('City', axis=1) data = data.drop('Open Date', axis=1) #Adjusting test data as well test_data['Type_IL'] = np.where(test_data['Type'] == 'IL', 1, 0) test_data['Type_FC'] = np.where(test_data['Type'] == 'FC', 1, 0) test_data['Type_DT'] = np.where(test_data['Type'] == 'DT', 1, 0) test_data['Big_Cities'] = np.where(test_data['City Group'] == 'Big Cities', 1, 0) test_data['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(test_data['Open Date'])).dt.days test_data = test_data.drop('Type', axis=1) test_data = test_data.drop('City Group', axis=1) test_data = test_data.drop('City', axis=1) test_data = test_data.drop('Open Date', axis=1)",No,4,8.0 " from sklearn import model_selection from sklearn import linear_model X = data.drop(['Id', 'revenue'], axis=1) Y = data.revenue ",No,5,21.0 " from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge from sklearn import metrics def check_rmse(X, Y, alpha): RMSE_lasso = [] RMSE_ridge = [] for i in alpha: lasso = Lasso(alpha=i) lasso.fit(X, Y) ridge = Ridge(alpha=i) ridge.fit(X, Y) RMSE_lasso.append(metrics.mean_squared_error(Y, lasso.predict(X))) RMSE_ridge.append(metrics.mean_squared_error(Y, ridge.predict(X))) return (RMSE_lasso, RMSE_ridge) ",No,5,84.0 "plt.figure() plt.plot(alpha, RMSE_lasso, 'o-', color=""r"", label=""RMSE_lasso"") plt.legend(loc='best') plt.show()'",No,5,35.0 "lasso = Lasso(alpha=5.5) lasso.fit(X, Y) metrics.mean_squared_error(Y, lasso.predict(X))",No,4,7.0 " model = Lasso(alpha=5.5) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = 
model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-5.5.csv', index=False) test_predicted.describe() ",No,3,28.0 " from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor(n_estimators=150) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-random-forest.csv', index=False) test_predicted.describe() ",No,3,7.0 " model = Ridge(alpha=330) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-ridge-330.csv', index=False) test_predicted.describe() ",No,3,7.0 " model = Lasso(alpha=200000) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-high-alpha.csv', index=False) test_predicted.describe() ",No,4,7.0 "data['Days_Open'].unique() ",No,5,57.0 " data['Time_Open'] = round(data['Days_Open'] / 700, 0) data = data.drop('Days_Open', axis=1) test_data['Time_Open'] = round(test_data['Days_Open'] / 700, 0) test_data = test_data.drop('Days_Open', axis=1) ",No,4,8.0 " X = data.drop(['Id', 'revenue'], axis=1) Y = data.revenue ",No,5,21.0 "model = Ridge(alpha=330) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission.csv', index=False) test_predicted",No,4,4.0 " model = Lasso(alpha=200000) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-.csv', index=False) test_predicted.describe() ",No,4,7.0 "# %% import numpy as np import itertools import plotly.express as px import pandas as pd from plotly.subplots import make_subplots from multiprocessing import Pool class CoronaSim: def __init__(self, grid_size, initial_virus, recover_time, speedreaction, incubation, virulence, contactsize=1, num_cores=4): self.sim_grid = np.zeros(shape=[grid_size, grid_size]) ini_x_virus = np.random.randint( low=0, high=grid_size, size=initial_virus) ini_y_virus = np.random.randint( low=0, high=grid_size, size=initial_virus) self.inistate_matrix = np.zeros(shape=[grid_size, grid_size]) self.inistate_matrix.fill(float(recover_time)) self.recover_time = recover_time self.inistate_matrix[ini_x_virus, ini_y_virus] = 7 self.speedreaction = speedreaction self.incubation = incubation self.samplesize = contactsize self.virulence = virulence self.num_cores = num_cores self.all_sites = list(itertools.product( range(self.sim_grid.shape[0]), range(self.sim_grid.shape[0]))) def mechanismcheck(self): state_value = np.arange(31) valuedf = pd.DataFrame( {'state': state_value, 'Activity': self.activity(state_value)}) f1 = px.scatter(valuedf, x=""state"", y=""Activity"") f1.data[0].update(mode='markers+lines') f1.update_traces(line_color='#B54434', marker_line_width=3, marker_size=4) distance = np.arange(200) disp = np.exp(-self.gm_virulence(20)*distance**2) contactdf = pd.DataFrame({'distance': distance, 'disp': disp}) f2 = px.line(contactdf, x=""distance"", y=""disp"") f2.data[0].update(mode='markers+lines') f2.update_traces(line_color='#1B813E', marker_line_width=3, marker_size=4) 
infected_num = np.arange(10000) measuredf = pd.DataFrame( {'infected_num': infected_num, 'measure': self.gm_virulence(infected_num)}) f3 = px.line(measuredf, x=""infected_num"", y=""measure"") f3.update_traces(line_color='#66327C', marker_line_width=3, marker_size=4) trace1 = f1['data'][0] trace2 = f2['data'][0] trace3 = f3['data'][0] fig = make_subplots(rows=3, cols=1, shared_xaxes=False, subplot_titles=( ""Figure 1"", ""Figure 2"", ""Figure 3"")) fig.add_trace(trace1, row=1, col=1) fig.add_trace(trace2, row=2, col=1) fig.add_trace(trace3, row=3, col=1) # Update xaxis properties fig.update_xaxes(title_text=""Health state"", row=1, col=1) fig.update_xaxes(title_text=""Distance"", range=[10, 50], row=2, col=1) fig.update_xaxes(title_text=""The number of infected cases"", showgrid=False, row=3, col=1) # Update yaxis properties fig.update_yaxes(title_text=""Willingness"", row=1, col=1) fig.update_yaxes(title_text=""Contact rate"", showgrid=False, row=2, col=1) fig.update_yaxes( title_text=""Intensity of the restriction"", row=3, col=1) # fig['layout'].update(height=800, width=800, showlegend=False) fig.update_layout( xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), xaxis2=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis2=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), xaxis3=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis3=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=800, width=800, ) fig.show() def activity(self, state): disp = np.exp((state-self.incubation) ** 2 / self.virulence ** 2) return disp def gm_virulence(self, infected_num): return 100*(2/(1+np.exp(-infected_num*self.speedreaction/(self.sim_grid.shape[0]*self.sim_grid.shape[1])))-1) def spread_prob(self, x_row, y_col, state, seed=1): np.random.seed(seed) distance_sites = np.linalg.norm( np.array(self.all_sites) - np.array([x_row, y_col]), axis=1) Act = self.activity(state) gm_virulence = self.gm_virulence( infected_num=len(np.where(state < self.recover_time)[0])) prob_spread = np.exp(-gm_virulence * distance_sites ** 2) * Act[x_row, y_col] * Act.flatten() prob_spread[x_row*self.sim_grid.shape[1]+y_col] = 0 focal_state = np.random.choice(range( self.sim_grid.shape[0]*self.sim_grid.shape[1]), size=self.samplesize, p=prob_spread/sum(prob_spread)) focal_state_value = 0 if min(state.flatten()[focal_state]) < self.recover_time else self.recover_time return focal_state_value def simspread(self, t_end, savefile): self.savefile = savefile state_matrix = self.inistate_matrix output_list = [] parallel_cores = Pool(self.num_cores) for t in range(t_end): num_infected = 
len(np.where(state_matrix < self.recover_time)[0]) print( f'At Day {t}, {num_infected} infected cases are confirmed...') healthy_individual_index_row = np.where(state_matrix >= self.recover_time)[0] healthy_individual_index_col = np.where(state_matrix >= self.recover_time)[1] change_state = parallel_cores.starmap(self.spread_prob, zip(healthy_individual_index_row, healthy_individual_index_col, itertools.repeat(state_matrix))) state_matrix[healthy_individual_index_row, healthy_individual_index_col] = change_state state_matrix += 1 output_list.append(state_matrix.tolist()) np.savez(self.savefile, *output_list) return state_matrix if __name__ == ""__main__"": test = CoronaSim(grid_size=100, initial_virus=5, contactsize=2,num_cores=6, recover_time=30, speedreaction=0.01, incubation=10, virulence=25) test.mechanismcheck()'",No,5,53.0 "# Start running simulations result = test.simspread(t_end=10, savefile='test.npz')",No,5,53.0 "# Simulation setup scenario1 = CoronaSim(grid_size=100, initial_virus=5, contactsize=2, num_cores=6, recover_time=30, speedreaction=0.01, incubation=7, virulence=25)",No,5,53.0 "# %% import plotly.graph_objects as go import numpy as np import pandas as pd num_infected = [] Day = [] batch_list = [] for batch in range(1, 4): savefile = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(savefile) sim_result = [container[key] for key in container] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) Day.extend(np.arange(len(sim_result)).tolist()) batch_list.extend(np.repeat(batch, len(sim_result))) infected_growth_df = pd.DataFrame( {'num_infected': num_infected, 'Day': Day, 'batch': batch_list}) # %% # Add data fig = go.Figure() # Create and style traces fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 1].Day, y=infected_growth_df[infected_growth_df['batch'] == 1].num_infected, name='Speed 0.01', line=dict(color='firebrick', width=4))) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 2].Day, y=infected_growth_df[infected_growth_df['batch'] == 2].num_infected, name='Speed 0.1', line=dict(color='royalblue', width=4, dash='dot'))) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 3].Day, y=infected_growth_df[infected_growth_df['batch'] == 3].num_infected, name='Speed 1', line=dict(color='green', width=4, dash='dash') # dash options include 'dash', 'dot', and 'dashdot' )) # Edit the layout fig.update_layout(title='The influence of government reaction speed on the pandemic development', xaxis_title='Day', yaxis_title='Number of infected cases', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800 ) fig.show() # %% ",Yes,2,33.0 "# %% import plotly.graph_objects as go import numpy as np import pandas as pd num_infected = [] Day = [] batch_list = [] for batch in range(1, 4): savefile = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(savefile) sim_result = [container[key] for key in container] acc_list = [] for t in range(1,len(sim_result)): acc_list.append(len(np.where(sim_result[t] < 30)[0])-len(np.where(sim_result[t-1] < 
30)[0])) num_infected.extend(acc_list) Day.extend(np.arange(len(sim_result)-1).tolist()) batch_list.extend(np.repeat(batch, len(sim_result)-1)) infected_growth_df = pd.DataFrame( {'num_infected': num_infected, 'Day': Day, 'batch': batch_list}) # %% # Add data fig = go.Figure() # Create and style traces fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 1].Day, y=infected_growth_df[infected_growth_df['batch'] == 1].num_infected, name='Speed 0.01', line=dict(color='firebrick', width=4),fill='tozeroy')) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 2].Day, y=infected_growth_df[infected_growth_df['batch'] == 2].num_infected, name='Speed 0.1', line=dict(color='royalblue', width=4, dash='dot'),fill='tozeroy')) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 3].Day, y=infected_growth_df[infected_growth_df['batch'] == 3].num_infected, name='Speed 1', line=dict(color='green', width=4, dash='dash'), # dash options include 'dash', 'dot', and 'dashdot' fill='tozeroy')) # Edit the layout fig.update_layout(title='', xaxis_title='Day', yaxis_title='Number of newly increase infected cases', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show() # %% ",Yes,5,53.0 "import plotly.express as px import pandas as pd import plotly.graph_objects as go import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) data['PSCR'] = data.Province_State.map(str)+ '' + data.Country_Region.map(str) region = pd.unique(data['PSCR']).tolist() f_region = [] time_list = [] region_name = [] for ci in range(len(region)): region_data = data[data['PSCR'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: pass f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'region': region_name}) fig = px.line(f_df, x='Day', y='increase', color='region') fig.update_layout(title='ip patterns', xaxis_title='Day', yaxis_title='Increasing percentage', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show()",Yes,2,45.0 "import plotly.express as px import pandas as pd import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) # %% all_region_data = 
data[pd.isna(data['Province_State'])] region = ['Japan', 'Israel'] # region = pd.unique(all_region_data['Country_Region']).tolist() f_region = [] time_list = [] region_name = [] for ci in range(len(region)): region_data = data[data['Country_Region'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: pass f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'region': region_name}) # %% sim_data = [] speed = [0.01,0.1,1] for batch in range(1,4): result = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(result) speed_batch = f'Sim: speed {speed[batch-1]}' sim_result = [container[key] for key in container] num_infected = [] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) inc_infected = [(num_infected[i+1]-num_infected[i])/num_infected[i] for i in range(len(num_infected)-1)] infected_growth_df = pd.DataFrame({'increase': inc_infected, 'Day': [ i for i in range(len(sim_result)-1)], 'region': speed_batch}) sim_data.append(infected_growth_df) sim_df = pd.concat(sim_data) # %% newf = f_df.append(sim_df) # %% fig = px.line(newf, x='Day', y='increase', color='region') fig.update_layout(title='ip patterns of Japan and Israel against 3 simulations', xaxis_title='Day', yaxis_title='Increasing percentage', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=400, width=600, ) fig.show()",Yes,5,53.0 "# %% import numpy as np import plotly.express as px import plotly.graph_objects as go import pandas as pd class plotresult: def __init__(self, savefile): container = np.load(savefile) self.sim_result = [container[key] for key in container] def infectiongrowth(self): num_infected = [] for t in range(len(self.sim_result)): num_infected.append(len(np.where(self.sim_result[t] < 30)[0])) infected_growth_df = pd.DataFrame({'num_infected': num_infected, 'Day': [ i for i in range(len(self.sim_result))]}) fig = go.Figure() fig.add_trace(go.Scatter(x=infected_growth_df.Day, y=infected_growth_df['num_infected'], name=""AAPL High"", line_color='deepskyblue')) fig.update_layout(title_text='Infection growth', xaxis_rangeslider_visible=True) fig.show() def infectionheatmap(self): infect_dis = [] col = [] row = [] days = [] for t in range(len(self.sim_result)): temp_re = self.sim_result[t].tolist() flatten_re = [item for sublist in temp_re for item in sublist] x_co = np.tile(range(len(temp_re)), len(temp_re)) y_co = np.repeat(range(len(temp_re)), len(temp_re)) day_series = np.repeat(t, len(temp_re)**2) infect_dis.extend(flatten_re) col.extend(x_co) row.extend(y_co) days.extend(day_series) heatmapdf = pd.DataFrame( {'dis': infect_dis, 'Day': days, 'col': col, 'row': row}) fig = 
px.scatter(heatmapdf, x=""col"", y=""row"", color='dis', animation_frame=""Day"", color_continuous_scale=[(0, ""#81C7D4""), (0.2, ""#D0104C""), (1, ""#81C7D4"")]) fig.update_layout(title='The pandemic development', xaxis_title='', yaxis_title='', xaxis=dict( showline=False, showgrid=False, showticklabels=False, ), yaxis=dict( showline=False, showgrid=False, showticklabels=False, ), autosize=True, plot_bgcolor='white', height=600, width=600, coloraxis_colorbar=dict( title=""Healthy state"" ) ) fig.show() # %% if __name__ == ""__main__"": result = '../input/simulation-scripts/outfile_s1.npz' testplot = plotresult(result) # testplot.infectiongrowth() testplot.infectionheatmap() # %% '",Yes,3,22.0 "# get place list places = np.sort(df_traintest['place_id'].unique()) print(len(places))",No,3,57.0 "# calc cases, fatalities per day df_traintest2 = copy.deepcopy(df_traintest) df_traintest2['cases/day'] = 0 df_traintest2['fatal/day'] = 0 tmp_list = np.zeros(len(df_traintest2)) for place in places: tmp = df_traintest2['ConfirmedCases'][df_traintest2['place_id']==place].values tmp[1:] -= tmp[:-1] df_traintest2['cases/day'][df_traintest2['place_id']==place] = tmp tmp = df_traintest2['Fatalities'][df_traintest2['place_id']==place].values tmp[1:] -= tmp[:-1] df_traintest2['fatal/day'][df_traintest2['place_id']==place] = tmp print(df_traintest2.shape) df_traintest2[df_traintest2['place_id']=='China/Hubei'].head()",No,2,12.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())]) pipeline.fit(train_numeric_X, train_numeric_Y) predicted = pipeline.predict(test_numeric_X)",Yes,3,7.0 from sklearn.svm import SVR,No,5,22.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', SVR())]) pipeline.fit(train_numeric_X, train_numeric_Y.values[:,0]) pipeline2 = Pipeline([('scaler', StandardScaler()), ('estimator', SVR())]) pipeline2.fit(train_numeric_X, train_numeric_Y.values[:,1]) discovered, fatal = pipeline.predict(test_numeric_X), pipeline2.predict(test_numeric_X)",No,3,7.0 "predicted_x1 = pipeline.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['ConfirmedCases'], histnorm='percent', name='actual discovered', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x1, histnorm='percent', name='predicted discovered', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='SVR Histogram of ConfirmedCases', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='ConfirmedCases', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) fig.show()",No,5,56.0 "predicted_x2 = pipeline2.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['Fatalities'], histnorm='percent', name='actual died', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x2, histnorm='percent', name='predicted died', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='SVR Histogram of Fatalities', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='Fatalities', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) 
fig.show()",No,5,56.0 "from sklearn.model_selection import KFold kf = KFold(n_splits=10) outcomes = [] fold = 0 for train_index, test_index in kf.split(train_numeric_X): fold += 1 X_train, X_test = train_numeric_X.values[train_index], train_numeric_X.values[test_index] y_train, y_test = train_numeric_Y['ConfirmedCases'].values[train_index], train_numeric_Y['ConfirmedCases'].values[test_index] pipeline.fit(X_train, y_train) predictions = RF_model.predict(X_test) accuracy = accuracy_score(y_test, predictions) outcomes.append(accuracy) print(""Fold {0} accuracy: {1}"".format(fold, accuracy)) mean_outcome = np.mean(outcomes) print(""\n\nMean Accuracy: {0}"".format(mean_outcome)) ",No,3,7.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', KNeighborsClassifier(n_jobs=4))]) pipeline.fit(train_numeric_X, train_numeric_Y) predicted_x = pipeline.predict(train_numeric_X)",No,4,7.0 "fig = go.Figure() fig.add_trace(go.Scatter( x=train_numeric_Y['ConfirmedCases'], y=train_numeric_Y['Fatalities'], marker=dict(color=""crimson"", size=12), mode=""markers"", name=""actual"", )) fig.add_trace(go.Scatter( x=predicted_x[:,0], y=predicted_x[:,1], marker=dict(color=""lightseagreen"", size=8), mode=""markers"", name=""predicted"", )) fig.update_layout(title=""RF result"", xaxis_title=""ConfirmedCases"", yaxis_title=""Fatalities"") fig.show()",No,5,56.0 from sklearn.ensemble import RandomForestClassifier,No,5,22.0 "RF_model = RandomForestClassifier(n_estimators=50, n_jobs=4, max_depth=5) RF_model.fit(train_numeric_X, train_numeric_Y) predicted = RF_model.predict(test_numeric_X)",No,3,7.0 predicted_x = RF_model.predict(train_numeric_X),No,5,27.0 "fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['ConfirmedCases'], histnorm='percent', name='actual discovered', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x[:,0], histnorm='percent', name='predicted discovered', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='RF Histogram of ConfirmedCases', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='ConfirmedCases', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) fig.show()",No,5,84.0 "from sklearn.metrics import make_scorer, accuracy_score accuracy_score(train_numeric_Y['ConfirmedCases'], predicted_x[:,0]), accuracy_score(train_numeric_Y['Fatalities'], predicted_x[:,1])",No,4,49.0 " from sklearn.model_selection import KFold kf = KFold(n_splits=10) outcomes = [] fold = 0 for train_index, test_index in kf.split(train_numeric_X): fold += 1 X_train, X_test = train_numeric_X.values[train_index], train_numeric_X.values[test_index] y_train, y_test = train_numeric_Y['ConfirmedCases'].values[train_index], train_numeric_Y['ConfirmedCases'].values[test_index] RF_model.fit(X_train, y_train) predictions = RF_model.predict(X_test) accuracy = accuracy_score(y_test, predictions) outcomes.append(accuracy) print(""Fold {0} accuracy: {1}"".format(fold, accuracy)) mean_outcome = np.mean(outcomes) print(""\n\nMean Accuracy: {0}"".format(mean_outcome)) ",No,3,7.0 from sklearn.ensemble import AdaBoostClassifier,No,5,22.0 "adaboost_model_for_ConfirmedCases = AdaBoostClassifier(n_estimators=5) adaboost_model_for_ConfirmedCases.fit(train_numeric_X, 
train_numeric_Y[numeric_features_Y[0]]) adaboost_model_for_Fatalities = AdaBoostClassifier(n_estimators=5) adaboost_model_for_Fatalities.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[1]])",No,4,7.0 "predicted_x1 = adaboost_model_for_ConfirmedCases.predict(train_numeric_X) predicted_x2 = adaboost_model_for_Fatalities.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Scatter( x=train_numeric_Y['ConfirmedCases'], y=train_numeric_Y['Fatalities'], marker=dict(color=""crimson"", size=12), mode=""markers"", name=""actual"", )) fig.add_trace(go.Scatter( x=predicted_x1, y=predicted_x2, marker=dict(color=""lightseagreen"", size=8), mode=""markers"", name=""predicted"", )) fig.update_layout(title=""ADB result"", xaxis_title=""ConfirmedCases"", yaxis_title=""Fatalities"") fig.show()'",No,4,56.0 train_y_pred = RF_model.predict(train_numeric_X),No,5,27.0 "plt.figure(figsize=(12,8)) plt.hist([train_numeric_Y['ConfirmedCases'],train_y_pred[:,0]],bins=100, range=(1,100), label=['ConfirmedCases_actual','ConfirmedCases_pred'],alpha=0.75) plt.title('ConfirmedCases Comparison',fontsize=20) plt.xlabel('sample',fontsize=20) plt.ylabel('match',fontsize=20) plt.legend() plt.show()",No,5,33.0 "plt.figure(figsize=(12,8)) plt.hist([train_numeric_Y['Fatalities'],train_y_pred[:,1]],bins=100, range=(1,100), label=['Fatalities_actual','Fatalities_pred'],alpha=0.75) plt.title('Fatalities Comparison',fontsize=20) plt.xlabel('sample',fontsize=20) plt.ylabel('match',fontsize=20) plt.legend() plt.show()",No,5,56.0 "error = np.sqrt((train_y_pred - train_numeric_Y)**2) error = error.cumsum()",No,5,28.0 "fig,ax = plt.subplots() plt.xlabel('sample') plt.ylabel('error') plt.subplot(2, 1, 1) plt.plot(range(len(error)), error['ConfirmedCases'], ""x-"",label=""ConfirmedCases"",color='orange') plt.legend() plt.subplot(2, 1, 2) plt.plot(range(len(error)), error['Fatalities'], ""+-"", label=""Fatalities"") plt.legend() plt.show()'",No,5,56.0 "from sklearn.metrics import mean_squared_error rmse = mean_squared_error(train_numeric_Y, train_y_pred , squared=False) rmse",No,4,49.0 "plt.bar(range(len(numeric_features_X)), RF_model.feature_importances_, tick_label=numeric_features_X) plt.xlabel('feature') plt.ylabel('weight') plt.xticks(rotation=90) plt.show()",No,5,33.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) ",No,5,88.0 "DT=1 optimize_model=False Make_submission=True n_estimators=200 #400 #500 #1500 max_depth=2 #4 #12 #8",No,5,59.0 "train=pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") train.head()",No,4,45.0 "train[train['Province_State'].notna()].groupby(['Country_Region'], sort=False)['Province_State'].nunique()",No,5,54.0 "def add_location(df_old): df=df_old.copy() df['Date']=pd.to_datetime(df['Date']) df['Country_Region']=df['Country_Region'].fillna('') df['Province_State']=df['Province_State'].fillna('') df['location']=df['Province_State'].astype('str')+"" ""+df['Country_Region'].astype('str') return df'",No,3,17.0 train=add_location(train),No,5,8.0 "Confirm_pivot=pd.pivot_table(train_df,index='Date',columns='Country_Region', values='ConfirmedCases',aggfunc=np.sum)",No,5,53.0 "plt.figure(figsize=(16,8)) colors=['r','b','g','y','orange','purple','m','hotpink','violet','darkgreen','navy','brown'] for i,country in enumerate(list_countries): Confirm=Confirm_pivot[Confirm_pivot[country]>0][country].diff().fillna(0) Confirm=Confirm[Confirm>0] Confirm.plot(color=colors[i],label=country,markersize=12,lw=5) plt.title('Number of Daily Cases',fontsize=15) plt.legend(title='country') plt.tight_layout()",No,5,33.0 "plt.figure(figsize=(20,16)) colors=['r','b','g','y','orange','purple','m','hotpink','violet','darkgreen','navy','brown'] for i,country in enumerate(list_countries): Confirm=Confirm_pivot[Confirm_pivot[country]>0][country].diff().fillna(0) Confirm=Confirm[Confirm>0] plt.subplot(4,3,i+1) Confirm.plot(color=colors[i],label=country,markersize=12,lw=5) plt.xticks() plt.legend(title='Country') plt.title('Number of Daily Cases in {}'.format(country.upper())) plt.tight_layout()",No,5,33.0 "train.set_index('location',inplace=True) train['day_of_year']=train['Date'].dt.dayofyear train['day_of_week']=train['Date'].dt.dayofweek first_day=train[(train['ConfirmedCases']>0)].groupby(['location'], sort=False)['day_of_year'].min() first_day.rename('first_day',inplace=True)",No,4,60.0 "def add_days_passed(df_old,first_day): df=df_old.copy() df=pd.concat([df,first_day],axis=1,join='inner') df['days_passed']=df['day_of_year']-df['first_day'] df.drop(columns=['first_day'],inplace=True) df['location']=df.index df.set_index('Id',inplace=True) df['Id']=df.index return df ",No,4,8.0 "train=add_days_passed(train,first_day) train.head()",No,5,8.0 "country_stat=pd.read_csv('../input/countryinfo/covid19countryinfo.csv') country_stat = country_stat[country_stat['region'].isnull()] def add_country_stat(old_df,country_stat): df=old_df.copy() df=df.merge(country_stat[['country','pop','medianage','sex65plus','lung','smokers','density']],left_on=['Country_Region'],right_on=['country'],how='left') df.drop(columns=['country'],inplace=True) df['pop']=df['pop'].fillna(1000) df['pop']=df['pop'].apply(lambda x: int(str(x).replace(',', ''))) #df['gdp2019']=df['gdp2019'].fillna(0) #df['gdp2019']=df['gdp2019'].apply(lambda x: int(str(x).replace(',', ''))) #df['gdp2019']=df['gdp2019']/df['pop'] df['density']=df['density'].fillna(0) df['medianage']=df['medianage'].fillna(0) #df['sexratio']=df['sexratio'].fillna(1) df['sex65plus']=df['sex65plus'].fillna(1) df['lung']=df['lung'].fillna(24) df['smokers']=df['smokers'].fillna(24) #df['lung']=df['lung']*df['pop'] return df train=add_country_stat(train,country_stat)",Yes,3,45.0 country_stat.info(),No,5,40.0 
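# --- Editor's sketch (added for clarity, not from the original notebook): the
# 'pop' column of covid19countryinfo.csv appears to store population as strings
# with thousands separators (e.g. '1,234,567'), which is why add_country_stat()
# above strips the commas before casting to int. A minimal, self-contained
# illustration of that cleaning step:
import pandas as pd
demo = pd.DataFrame({'pop': ['1,234,567', '89,100', None]})
demo['pop'] = demo['pop'].fillna(1000)  # same default used in add_country_stat()
demo['pop'] = demo['pop'].apply(lambda x: int(str(x).replace(',', '')))
print(demo)  # -> 1234567, 89100, 1000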
"border_info=pd.read_csv(""https://raw.githubusercontent.com/geodatasource/country-borders/master/GEODATASOURCE-COUNTRY-BORDERS.CSV"") border_info.drop(columns=[""country_code"",""country_border_code""],inplace=True) border_info.replace({'United States of America':'US', 'United Kingdom of Great Britain and Northern Ireland':'United Kingdom', 'Bolivia (Plurinational State Of)':'Bolivia', 'Brunei Darussalam':'Brunei', 'Gambia (the)':'Gambia', 'Congo':'Congo (Kinshasa)', 'Cote dIvoire':""Cote d'Ivoire"", ""Iran (Islamic Republic of)"":'Iran', ""Korea (the Republic of)"":'Korea, South', ""Lao People's Democratic Republic"":'Laos', ""Moldova (the Republic of)"":'Moldova', ""Russian Federation"":'Russia', ""Syrian Arab Republic"":'Syria', ""Taiwan (Province of China)"":'Taiwan*', ""Tanzania (the United Republic of)"":'Tanzania', ""Venezuela (Bolivarian Republic of)"":'Venezuela', ""Viet Nam"":'Vietnam'},inplace=True) border_info=border_info.fillna("""") border_info.to_csv(""border_info.csv"") '",Yes,2,45.0 "from itertools import product as it_product def expand_grid(data_dict): rows = it_product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys())",No,4,12.0 "skel=expand_grid({'Index':border_info.index,'Date':train['Date'].unique()}) skel.info()",No,4,12.0 "country_info=train.groupby(['Date','Country_Region'])['ConfirmedCases'].sum()",No,5,60.0 "skel=expand_grid({'Index':border_info.index,'Date':train['Date'].unique()}) skel=skel.merge(border_info, how='inner', left_on=['Index'],right_index=True) skel=skel.merge(country_info, how='inner', left_on=['Date','country_border_name'],right_on=['Date','Country_Region'])",No,4,32.0 "from datetime import timedelta skel['Date']=skel['Date']+timedelta(days=DT) border_cases=skel.groupby(['country_name','Date'])['ConfirmedCases'].sum() len(skel['country_name'].unique())",No,3,8.0 "train=train.merge(border_cases, how='left', left_on=['Country_Region','Date'],right_on=['country_name','Date']) train['ConfirmedCases_y']=train['ConfirmedCases_y'].fillna(0) train.rename(columns={'ConfirmedCases_y':'ConfirmedCases_neighbors','ConfirmedCases_x':'ConfirmedCases'},inplace=True)",No,4,32.0 "big_train = pd.concat([train,pd.get_dummies(train['location'], prefix='loc')],axis=1) big_train['ConfirmedCases_neighbors']=np.log1p(big_train['ConfirmedCases_neighbors']) big_train.reset_index(inplace=True) big_train.drop(columns=[""Id""],inplace=True)'",No,3,11.0 big_train.shape,No,5,58.0 "def df_add_deltas(df_old): df=df_old.copy() df=df.sort_values(by=['location', 'Date']) df['d_ConfirmedCases'] = df.groupby(['location'])['ConfirmedCases'].diff() df['d_Fatalities'] = df.groupby(['location'])['Fatalities'].diff() df.loc[df['d_Fatalities']<0,'d_Fatalities']=0 df.loc[df['d_ConfirmedCases']<0,'d_ConfirmedCases']=0 df['prev_ConfirmedCases']=df['ConfirmedCases']-df['d_ConfirmedCases'] df['prev_Fatalities']=df['Fatalities']-df['d_Fatalities'] df['prev_ConfirmedCases']=np.log1p(df['prev_ConfirmedCases']) df['prev_Fatalities']=np.log1p(df['prev_Fatalities']) first_day_stat=df[df['Date']=='2020-01-22'] df.drop(df[df['Date']=='2020-01-22'].index, inplace=True) return df,first_day_stat ",No,5,8.0 "big_train,first_day_stat=df_add_deltas(big_train)",No,5,8.0 "big_train.reset_index(inplace=True,drop=True)",No,5,84.0 "X=big_train.drop(columns=['Province_State','Country_Region','Date','ConfirmedCases','Fatalities','location', 'd_ConfirmedCases','d_Fatalities']) y=big_train['d_ConfirmedCases'] y_2=big_train['d_Fatalities']",No,4,10.0 "max_day=X['day_of_year'].max() 
mask_train=X['day_of_year']<max_day-DT+1 mask_test=X['day_of_year']>=max_day-DT+1",No,4,8.0 "X_train=X[mask_train] X_test=X[mask_test] y_train=y[mask_train] y_test=y[mask_test] y_train_2=y_2[mask_train] y_test_2=y_2[mask_test]",No,5,21.0 X_test['day_of_year'].nunique(),No,5,54.0 "X_train.drop(columns=['day_of_year'],inplace=True) #including day of year makes things worse RMSLE goes up from 0.49 to 0.7 X_test.drop(columns=['day_of_year'],inplace=True) #including day of year makes things worse RMSLE goes up from 0.49 to 0.7 X_train.drop(columns=['day_of_week'],inplace=True) #including day of week makes things worse RMSLE goes up from 0.49 to 0.57 X_test.drop(columns=['day_of_week'],inplace=True) #including day of week makes things worse RMSLE goes up from 0.49 to 0.57 X.drop(columns=['day_of_year'],inplace=True) X.drop(columns=['day_of_week'],inplace=True) ",No,5,10.0 "X.drop(columns=['index'],inplace=True) X_train.drop(columns=['index'],inplace=True) X_test.drop(columns=['index'],inplace=True)",No,5,10.0 "# Best: -0.252369 using {'max_depth': 6, 'n_estimators': 1500} # Best: -1.051575 using {'max_depth': 6, 'n_estimators': 500} - predict shifts log # Best: -278.598983 using {'max_depth': 10, 'n_estimators': 500} - predict values # Best: -1.111758 using {'max_depth': 6, 'n_estimators': 500} - predict shifts log, knowing prev log import xgboost as xgb from sklearn.model_selection import GridSearchCV if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [500, 1000,1500] max_depth_grid = [6, 8, 10] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",No,3,6.0 "# Best: -0.211438 using {'max_depth': 5, 'n_estimators': 2500} # Best: -0.974302 using {'max_depth': 5, 'n_estimators': 400} - predict shifts log # Best: -274.964946 using {'max_depth': 12, 'n_estimators': 500} # Best: -1.064197 using {'max_depth': 5, 'n_estimators': 400} if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [400,500,600] max_depth_grid = [5,6,7] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",Yes,3,6.0 "#Best: -0.211107 using {'max_depth': 5, 'n_estimators': 3000} #Best: -0.940498 using {'max_depth': 4, 'n_estimators': 400} #Best: -274.964946 using {'max_depth': 12, 'n_estimators': 500} #Best: -0.861262 using {'max_depth': 2, 'n_estimators': 200} if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [150,200,250] max_depth_grid = [1,2,3] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",Yes,3,6.0
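# Editor's note: `reg` and `reg_2` are fitted in the next cells, but the cell
# that constructs them does not appear in this extract. A plausible definition,
# assuming the XGBoost regressor tuned above and the n_estimators/max_depth
# constants from the configuration cell (n_estimators=200, max_depth=2, which
# match the best grid-search result of -0.861262), would be:
import xgboost as xgb
reg = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
reg_2 = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)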
"reg.fit(X_train,np.log1p(y_train))",No,5,7.0 y_pred = reg.predict(X_test),No,5,48.0 "from sklearn.metrics import mean_squared_error np.sqrt(mean_squared_error(y_pred,np.log1p(y_test)))",No,5,49.0 "X_train_2=X_train.copy() X_train_2['d_confirmed']=y_train #0.4412899060661785 <- without , with - 0.4463 X_test_2=X_test.copy() X_test_2['d_confirmed']=y_pred",No,4,49.0 "reg_2.fit(X_train_2,np.log1p(y_train_2))",No,5,7.0 y_pred_2 = reg_2.predict(X_test_2),No,5,48.0 "np.sqrt(mean_squared_error(y_pred_2,np.log1p(y_test_2)))",No,5,49.0 "test=pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") test.rename(columns={'ForecastId':'Id'},inplace=True) test=add_location(test) test.set_index('location',inplace=True) test['day_of_year']=test['Date'].dt.dayofyear test['day_of_week']=test['Date'].dt.dayofweek test=add_days_passed(test,first_day) test=add_country_stat(test,country_stat)'",Yes,2,45.0 "big_train=big_train.drop(columns=[""index""]) ",No,5,10.0 "known=big_train['Date'].unique() print(known)",No,5,57.0 "if Make_submission==True: results=[] for d in days_to_predict: print(""Predicting {}"".format(d)) if d in known: print(""Data Known"") X=big_train.drop(columns=['Province_State','Country_Region','ConfirmedCases','Fatalities','location','Date', 'day_of_year','day_of_week','d_ConfirmedCases','d_Fatalities']) y=big_train['d_ConfirmedCases'] y_2=big_train['d_Fatalities'] mask_train=big_train['Date']> Prediction of confirmed cases # This script trains the model on the latest dataset and predicts the next value # Author: Neilay Khasnabish # Import libraries import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor import matplotlib.pyplot as plt from sklearn.model_selection import RandomizedSearchCV import tqdm as tqdm # Making Kaggle dataset ready def kaggle(dfTrain, dfTest): pd.set_option('display.max_columns', None) dfTest['DateNew'] = pd.to_datetime(dfTest['Date']) dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.rename(columns={""DateNew"": ""Date""}) dfTest['Year'] = dfTest['Date'].dt.year dfTest['Month'] = dfTest['Date'].dt.month dfTest['Day'] = dfTest['Date'].dt.day dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.fillna('DummyProvince') #dfTest.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTest.csv') dfTrain['DateNew'] = pd.to_datetime(dfTrain['Date']) dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.rename(columns={""DateNew"": ""Date""}) dfTrain['Year'] = dfTrain['Date'].dt.year dfTrain['Month'] = dfTrain['Date'].dt.month dfTrain['Day'] = dfTrain['Date'].dt.day dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.fillna('DummyProvince') #dfTrain.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTrain.csv') result = pd.merge(dfTest, dfTrain, how='left', on=['Country_Region', 'Province_State', 'Year', 'Month', 'Day']) result = result.fillna(-1) # Clutter removal [rr, cc] = np.shape(result) for iQuit in range(rr): if result.loc[iQuit, 'Day'] == 4 : result.loc[iQuit, 'ConfirmedCases'] = -1 result.loc[iQuit, 'Fatalities'] = -1 #result.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/temp.csv') return result # Finding RMSE def ErrorCalc(mdl, ref, tag): relError = np.abs(mdl - ref)/ np.abs(ref+1) MeanErrorV = np.mean(relError) print(tag + ': Mean Rel Error in %: ', MeanErrorV * 100) return MeanErrorV # Since cumulative prediction >> This script is not used for Kaggle dataset def 
AdjustingErrorsOutliers(tempPred, df) : tempPred = np.round(tempPred) tempPrev = df['day5'].to_numpy() # Next cumulative prediction must be more than or equal to previous for i in range(len(tempPred)): if tempPred[i] < tempPrev[i] : # Since cumulative prediction tempPred[i] = tempPrev[i] return tempPred # Train model def TrainMdl (trainIpData, trainOpData) : testSize = 0.1 # 90:10 ratio >> for final testing print('Training starts ...') randomState=None # randomState = 42 # For train test split # Final validation X_train, X_test, y_train, y_test = train_test_split(trainIpData, trainOpData, test_size=testSize, random_state=randomState) # Another set of input TrainIP = X_train[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TrainOP = X_train['gammaFun'] TestIP = X_test[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TestOP = X_test['gammaFun'] # Adaboost Regressor >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> treeDepth = 10 # Fixed mdl = DecisionTreeRegressor(max_depth=treeDepth) # This is fixed param_grid = { 'n_estimators': [100, 250, 500], 'learning_rate': [0.1, 0.01, 0.001] } regrMdl = AdaBoostRegressor(base_estimator=mdl) clf = RandomizedSearchCV(estimator = regrMdl, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=0, random_state=42, n_jobs = -1) clf.fit(TrainIP, TrainOP) # Calculating Error y_predictedTrain = clf.predict(TrainIP) # Predicting the gamma function y_predictedTrain = AdjustingErrorsOutliers(y_predictedTrain * TrainIP['day5'].to_numpy(), TrainIP) ErrorCalc(y_predictedTrain, y_train.to_numpy(), 'Train Data-set') # y_predictedTrain converted to numbers y_predictedTest = clf.predict(TestIP) # Predicting the gamma function y_predictedTest = AdjustingErrorsOutliers(y_predictedTest * TestIP['day5'].to_numpy(), TestIP) ErrorCalc(y_predictedTest, y_test.to_numpy(), 'Validation Data-set ') # y_predictedTest converted to numbers print('-----------------------------------------------------------') # Read Kaggle dataset dfTrain = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") dfTest = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") df = kaggle(dfTrain, dfTest) print('Prediction starts ...') [rr, cc] = np.shape(df) for iP in range(rr): if df.loc[iP, 'ConfirmedCases'] == -1 : # iP-th position need to be predicted # Create a dataframe day5 = df.loc[iP-1, 'ConfirmedCases'] day4 = df.loc[iP-2, 'ConfirmedCases'] day3 = df.loc[iP-3, 'ConfirmedCases'] day2 = df.loc[iP-4, 'ConfirmedCases'] day1 = df.loc[iP-5, 'ConfirmedCases'] diff1 = day5 - day4 diff2 = day4 - day3 diff3 = day3 - day2 diff4 = day2 - day1 data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]} dfPredict = pd.DataFrame(data) finalPrediction = clf.predict(dfPredict[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']]) * day5 if finalPrediction < day5 : finalPrediction = day5 df.loc[iP, 'ConfirmedCases'] = np.round(finalPrediction) # Update the current location return df # Main code starts df = pd.read_csv(""../input/processedtimedata/TrainTest.csv"") # Processed dta from JHU trainIpData = df[['day1', 'day2', 'day3', 'day4', 'day5', 'gammaFun', 'diff1', 'diff2', 'diff3', 'diff4']] trainOpData = df['dayPredict'] # Predicted confirmed case predictions_dF = TrainMdl (trainIpData, trainOpData) # Kaggle data will be read inside print('Completed ...') #predictions_dF[['ForecastId', 
'ConfirmedCases', 'Fatalities']].to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/submission_ConfirmedCases.csv', index = False)'",Yes,1,7.0 "# KAGGLE competition >> Fatality rate # This script trains the model on the latest dataset and predicts the next value # Author: Neilay Khasnabish # Import libraries import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor import matplotlib.pyplot as plt from sklearn.model_selection import RandomizedSearchCV # Making Kaggle dataset ready def kaggle(dfTrain, dfTest): pd.set_option('display.max_columns', None) dfTest['DateNew'] = pd.to_datetime(dfTest['Date']) dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.rename(columns={""DateNew"": ""Date""}) dfTest['Year'] = dfTest['Date'].dt.year dfTest['Month'] = dfTest['Date'].dt.month dfTest['Day'] = dfTest['Date'].dt.day dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.fillna('DummyProvince') #dfTest.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTest.csv') dfTrain['DateNew'] = pd.to_datetime(dfTrain['Date']) dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.rename(columns={""DateNew"": ""Date""}) dfTrain['Year'] = dfTrain['Date'].dt.year dfTrain['Month'] = dfTrain['Date'].dt.month dfTrain['Day'] = dfTrain['Date'].dt.day dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.fillna('DummyProvince') #dfTrain.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTrain.csv') result = pd.merge(dfTest, dfTrain, how='left', on=['Country_Region', 'Province_State', 'Year', 'Month', 'Day']) result = result.fillna(-1) # Clutter removal [rr, cc] = np.shape(result) for iQuit in range(rr): if result.loc[iQuit, 'Day'] == 4 : result.loc[iQuit, 'ConfirmedCases'] = -1 result.loc[iQuit, 'Fatalities'] = -1 #result.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/temp.csv') return result # Finding RMSE def ErrorCalc(mdl, ref, tag): relError = np.abs(mdl - ref)/ np.abs(ref+1) MeanErrorV = np.mean(relError) print(tag + ': Mean Rel Error in %: ', MeanErrorV * 100) return MeanErrorV # Since cumulative prediction >> This script is not used for Kaggle dataset def AdjustingErrorsOutliers(tempPred, df) : tempPred = np.round(tempPred) tempPrev = df['day5'].to_numpy() # Next cumulative prediction must be more than or equal to previous for i in range(len(tempPred)): if tempPred[i] < tempPrev[i] : # Since cumulative prediction tempPred[i] = tempPrev[i] return tempPred # Train model def TrainMdl (trainIpData, trainOpData) : testSize = 0.1 # 90:10 ratio >> for final testing print('Training starts ...') randomState=None # randomState = 42 # For train test split # Final validation X_train, X_test, y_train, y_test = train_test_split(trainIpData, trainOpData, test_size=testSize, random_state=randomState) # Another set of input TrainIP = X_train[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TrainOP = X_train['gammaFun'] TestIP = X_test[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TestOP = X_test['gammaFun'] # Adaboost Regressor >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> treeDepth = 10 # Fixed mdl = DecisionTreeRegressor(max_depth=treeDepth) # This is fixed param_grid = { 'n_estimators': [100, 250, 500], 'learning_rate': [0.1, 0.01, 0.001] } regrMdl = AdaBoostRegressor(base_estimator=mdl) clf = RandomizedSearchCV(estimator = regrMdl, param_distributions = param_grid, n_iter = 
100, cv = 3, verbose=0, random_state=42, n_jobs = -1) clf.fit(TrainIP, TrainOP) # Calculating Error y_predictedTrain = clf.predict(TrainIP) # Predicting the gamma function y_predictedTrain = AdjustingErrorsOutliers(y_predictedTrain * TrainIP['day5'].to_numpy(), TrainIP) ErrorCalc(y_predictedTrain, y_train.to_numpy(), 'Train Data-set') # y_predictedTrain converted to numbers y_predictedTest = clf.predict(TestIP) # Predicting the gamma function y_predictedTest = AdjustingErrorsOutliers(y_predictedTest * TestIP['day5'].to_numpy(), TestIP) ErrorCalc(y_predictedTest, y_test.to_numpy(), 'Validation Data-set ') # y_predictedTest converted to numbers print('-----------------------------------------------------------') # Read Kaggle dataset dfTrain = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") dfTest = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") df = kaggle(dfTrain, dfTest) [rr, cc] = np.shape(df) for iP in range(rr): if df.loc[iP, 'Fatalities'] == -1 : # iP-th position need to be predicted # Create a dataframe day5 = df.loc[iP-1, 'Fatalities'] day4 = df.loc[iP-2, 'Fatalities'] day3 = df.loc[iP-3, 'Fatalities'] day2 = df.loc[iP-4, 'Fatalities'] day1 = df.loc[iP-5, 'Fatalities'] diff1 = day5 - day4 diff2 = day4 - day3 diff3 = day3 - day2 diff4 = day2 - day1 data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]} dfPredict = pd.DataFrame(data) finalPrediction = clf.predict(dfPredict[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']]) * day5 if finalPrediction < day5 : finalPrediction = day5 df.loc[iP, 'Fatalities'] = np.round(finalPrediction) # Update the current location return df # Main code starts df = pd.read_csv(""../input/processedtimedata/TrainTest_Fatality.csv"") # Processed dta from JHU trainIpData = df[['day1', 'day2', 'day3', 'day4', 'day5', 'gammaFun', 'diff1', 'diff2', 'diff3', 'diff4']] trainOpData = df['dayPredict'] # Predicted fatality fatality_dF = TrainMdl (trainIpData, trainOpData) # Kaggle data will be read inside print('Completed ...') #predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/submission_Fatality.csv', index = False)'",Yes,1,7.0 "# Creating the submission predictions_dF['Fatalities'] = fatality_dF['Fatalities'] predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('submission.csv', index=False) print(predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].head(10)) print(predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].tail(10))",No,4,25.0 "b""# import necessary modules\nimport numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport os\nimport warnings\nfrom datetime import datetime\nfrom scipy import stats\nfrom scipy.stats import norm, skew, probplot \n\nwarnings.filterwarnings('ignore')\ndaybasecount = 4 #antal dage der summeres over til estimat for kurvefaktorer\nbaseday = 98 - float(daybasecount-1)/2. 
#89.5 # var 86 #den dag i ret hvor der regnes ud fra\nexponent = 1./float(daybasecount) #exponent der overstter daybasecount increase til daglig increase\nfatalityBaseDayShift = 10 #antal dage baslns der beregnes ddsfald fra\nmaxincrease = 140 # strste tilladte increase mlt i procent over 4 dage\nmaxDeadPrDay = 1500""",No,4,77.0 "#print(os.listdir(""../kaggle-Covid19/covid19-global-forecasting-week-2"")) dftrain = pd.read_csv('../input/covid19-global-forecasting-week-3/train.csv', parse_dates=['Date']).sort_values(by=['Country_Region', 'Date']) dftest = pd.read_csv('../input/covid19-global-forecasting-week-3/test.csv', parse_dates=['Date']).sort_values(by=['Country_Region', 'Date']) # CURVE SMOOTHING #Add averages #dftrain ['Cases_m'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(1)) #dftrain ['Cases_p'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(-1)) #dftrain ['Cases_ave'] = 0.5*(dftrain['ConfirmedCases']+0.5*(dftrain['Cases_p']+dftrain['Cases_m'])) #case_cols = ['ConfirmedCases','Cases_m','Cases_p','Cases_ave'] #dftrain ['Fatalities_m'] = dftrain.groupby(['Country_Region', 'Province_State'])[['Fatalities']].transform(lambda x: x.shift(1)) #dftrain ['Fatalities_p'] = dftrain.groupby(['Country_Region', 'Province_State'])[['Fatalities']].transform(lambda x: x.shift(-1)) #dftrain ['Fatalities_ave'] = 0.5*(dftrain['Fatalities']+0.5*(dftrain['Fatalities_p']+dftrain['Fatalities_m'])) #fata_cols = ['Fatalities','Fatalities_m','Fatalities_p','Fatalities_ave'] #date_max = dftrain[""Date""].max() #mask = df[""Date""]==date_max #dftrain.loc[mask,'Cases_ave'] = 0.75*dftrain.loc[mask,'ConfirmedCases']+0.25*dftrain.loc[mask,'Cases_m'] #dftrain.loc[mask,'Fatalities_ave'] = 0.75*dftrain.loc[mask,'Fatalities'] +0.25*dftrain.loc[mask,'Fatalities_m'] #dftrain.drop(['Cases_m', 'Cases_p', 'Fatalities_m','Fatalities_p'],axis=1,inplace=True) #dftrain.drop(['ConfirmedCases','Fatalities'],axis=1,inplace = True) #dftrain.rename(columns={'Cases_ave':'ConfirmedCases','Fatalities_ave':'Fatalities'},inplace=True) #dftrain['ConfirmedCases','Fatalities'].fillna(0, inplace=True) ppp_tabel = pd.read_csv('../input/country-ppp/Country_PPP.csv', sep='\\s+')#.sort_values(by=['Country']) ppp_tabel.drop('Id', 1,inplace=True) ppp_tabel = ppp_tabel.append({'Country' : 'Burma' , 'ppp' : 8000} , ignore_index=True) ppp_tabel = ppp_tabel.append({'Country' : 'MS_Zaandam' , 'ppp' : 40000} , ignore_index=True) ppp_tabel = ppp_tabel.append({'Country' : 'West_Bank_and_Gaza' , 'ppp' : 20000} , ignore_index=True) ppp_tabel[""Country""].replace( '_',' ', regex=True,inplace=True) # _ var indfrt for at f den til at lse ppp_tabel[""Country""].replace( 'United States','US', regex=True,inplace=True) # _ var indfrt for at f den til at lse ppp_tabel.rename(columns={'Country':'Country_Region'},inplace=True) ppp_tabel.sort_values('Country_Region',inplace=True)'",No,4,45.0 "dftrain['Dayofyear'] = dftrain['Date'].dt.dayofyear dftest['Dayofyear'] = dftest['Date'].dt.dayofyear dftest['Expo'] = dftest['Dayofyear']-baseday print(dftrain.tail(5)) dftest = dftest.merge(dftrain[['Country_Region','Province_State','Date','ConfirmedCases','Fatalities']] , on=['Country_Region','Province_State','Date'], how='left', indicator=True) ",No,4,32.0 "#dftrain = dftrain.loc[dftrain['Country_Region'] == 'Denmark'] dftrain['Province_State'].fillna(dftrain['Country_Region'], inplace=True) dftest ['Province_State_orig'] = dftest ['Province_State'] dftest 
['Province_State'].fillna(dftest['Country_Region'], inplace=True) dftrain.sort_values(by =['Country_Region', 'Province_State','Date'], inplace=True) dftrain[['NewCases','NewFatalities']] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases','Fatalities']].transform(lambda x: x.diff()) dftrain['FatalityBasis'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(fatalityBaseDayShift)) #smid alt andet end senete bort dftrain = dftrain.loc[dftrain['Dayofyear'] > 80] #find sidste dato med oplysninger to_sum = ['Country_Region','Province_State','ConfirmedCases','Fatalities'] lastinfo = dftrain.groupby(['Country_Region','Province_State']).tail(1)[to_sum] lastinfo.rename(columns={'ConfirmedCases':'ConfirmedCases_init','Fatalities':'Fatalities_init'},inplace=True) #find gennemsnit af sidste 4(=daybasecount) dage to_sum = ['ConfirmedCases','NewCases','FatalityBasis'] grouped = dftrain.groupby(['Country_Region','Province_State']).tail(daybasecount) grouped_gem = dftrain.groupby(['Country_Region','Province_State'])[to_sum].mean() grouped_gem.reset_index(inplace=True) grouped_gem.rename(columns={'ConfirmedCases':'ConfirmedCases_base','Fatalities':'Fatalities_base' ,'NewCases':'NewCases_base'},inplace=True) grouped_gem = grouped_gem.merge(lastinfo, on=['Country_Region','Province_State'], how='outer', indicator=True) to_sum = ['NewCases','NewFatalities','FatalityBasis'] grouped2 = grouped.groupby(['Country_Region'])[to_sum].sum() grouped2['FatalityPct'] = 100*grouped2['NewFatalities']/grouped2['FatalityBasis'] grouped2.rename(columns={'NewCases':'NewCases2','NewFatalities':'NewFatalities2' ,'FatalityBasis':'FatalityBasis2','FatalityPct':'FatalityPct2'},inplace=True) with_ppp = pd.merge(grouped2, ppp_tabel, on=['Country_Region'], how='outer', indicator=True) missing = with_ppp.loc[with_ppp['ppp'].isnull()] dftrain.head(60) ",Yes,2,60.0 "b""#find gennemsnit af forrige 4(=daybasecount) dage\ngrouped=dftrain.groupby(['Country_Region','Province_State']).tail(daybasecount*2)\ngrouped=grouped.groupby(['Country_Region','Province_State']).head(daybasecount)\ngrouped.drop(['FatalityBasis'],axis=1,inplace=True)\n\nto_sum = ['NewCases','NewFatalities']\ngrouped1 = grouped.groupby(['Country_Region'])[to_sum].sum()\n\ngrouped1.rename(columns={'NewCases':'NewCases1','NewFatalities':'NewFatalities1'}, inplace=True)\n\n# beregn grundlggende increase ud fra sidst og forrige 4(=daybasecount) dage\ngrouped = pd.merge(grouped1, grouped2, on=['Country_Region'])\ngrouped['CasesIncreasePct'] = 100*(grouped['NewCases2']/grouped['NewCases1']-1)\nmask = grouped['CasesIncreasePct'] > maxincrease\ngrouped.loc[mask,'CasesIncreasePct'] = maxincrease\nmask = grouped['CasesIncreasePct'] < 0\ngrouped.loc[mask,'CasesIncreasePct'] = 0\nmask = grouped['CasesIncreasePct'].isnull()\ngrouped.loc[mask,'CasesIncreasePct'] = 0\ngrouped['Factor'] = (grouped['CasesIncreasePct']/100+1)**exponent\n\ngrouped = pd.merge(grouped, ppp_tabel, on=['Country_Region'])\n#grouped['ppp'].isnull().sum()\n\n# afgrns Fatality procent ud fra hndestimerede kurver med ppp \ngrouped['ppp'] = grouped['ppp']/10000.\nif False:\n mask = (grouped['FatalityPct2'] > 9) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 5\n mask = (grouped['FatalityPct2'] < 5) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 5\n mask = (grouped['FatalityPct2'] > 6) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 6\n mask = (grouped['FatalityPct2'] < 1.5) & (grouped['ppp'] >= 7)\n 
grouped.loc[mask,'FatalityPct2'] = 1.5\n mask = (grouped['FatalityPct2'] > (9.5 - 0.43*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (9.5 - 0.43*grouped['ppp'])\n mask = (grouped['FatalityPct2'] < (5.6 - 0.5*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (5.6 - 0.5*grouped['ppp'])\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 7\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 4\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (7.5 - 0.5*grouped['ppp'])\nelse:\n mask = (grouped['FatalityPct2'] > 4) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 3\n mask = (grouped['FatalityPct2'] < 1) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 2\n mask = (grouped['FatalityPct2'] > 1.5) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 1.5\n mask = (grouped['FatalityPct2'] < 0.5) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 0.5\n mask = (grouped['FatalityPct2'] > (4.5 - 0.43*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (4.5 - 0.43*grouped['ppp'])\n mask = (grouped['FatalityPct2'] < (1.1 - 0.1*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (1.1 - 0.1*grouped['ppp'])\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 3\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 1\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (2.6 - 0.23*grouped['ppp'])""",Yes,2,14.0 "dftest.drop('_merge',axis=1,inplace= True) dftest = dftest.merge(grouped[['Country_Region','FatalityPct2','Factor']], on=['Country_Region'], how='left') dftest = dftest.merge(grouped_gem[['Province_State','Country_Region','ConfirmedCases_base','ConfirmedCases_init','NewCases_base','Fatalities_init','FatalityBasis']], on=['Province_State','Country_Region'], how='left') ",No,4,32.0 "dftest['ConfirmedCases_shift'] = dftest.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(1)) mask = dftest['ConfirmedCases'].isnull() # find new cases dftest.loc[mask,'NewCases'] = dftest.loc[mask,'NewCases_base']*(dftest.loc[mask,'Factor']**dftest.loc[mask,'Expo']) #dftest.loc[mask,'Confirmed'] = dftest.loc[mask,'FatalityBasis2']*(dftest.loc[mask,'Factor']**dftest.loc[mask,'Expo']) dftest['NewCases_cum'] = dftest.groupby(['Country_Region', 'Province_State'])[['NewCases']].cumsum() #transform(lambda x: x.shift(1)) dftest.loc[mask,'ConfirmedCases'] = dftest.loc[mask,'ConfirmedCases_init'] + dftest.loc[mask,'NewCases_cum'] #juster Fatality nr der er rigtig mange cases dvs. der testes meget mask3 = dftest['ConfirmedCases'] > 400000 dftest.loc[mask3,'FatalityPct2'] = dftest.loc[mask3,'FatalityPct2']*0.7 mask4 = dftest['ConfirmedCases'] > 800000 dftest.loc[mask4,'FatalityPct2'] = dftest.loc[mask4,'FatalityPct2']*0.7 dftest['FatalityBasis'] = dftest.groupby(['Country_Region', 'Province_State'])[ ['ConfirmedCases']].transform(lambda x: x.shift(10)) dftest.loc[mask,'NewFatalities'] = dftest.loc[mask,'FatalityBasis'] * dftest.loc[mask,'FatalityPct2']/100 # st max tal for antal dde pr. 
dag mask2 = dftest['NewFatalities'] > maxDeadPrDay dftest.loc[mask2,'NewFatalities'] = maxDeadPrDay #print(""MASK2"",mask2.sum()) dftest['NewFatalities_cum'] = dftest.groupby(['Country_Region', 'Province_State'])[['NewFatalities']].cumsum() #transform(lambda x: x.shift(1)) dftest.loc[mask,'Fatalities'] = dftest.loc[mask,'Fatalities_init'] + dftest.loc[mask,'NewFatalities_cum'] '",Yes,3,8.0 "# Forbered aflevering dftest.drop(['Dayofyear', 'Expo','FatalityPct2', 'Factor', 'ConfirmedCases_base', 'ConfirmedCases_init', 'NewCases_base', 'Fatalities_init', 'FatalityBasis', 'ConfirmedCases_shift', 'NewCases', 'NewCases_cum', 'NewFatalities','NewFatalities_cum'],axis=1,inplace=True) final = dftest.groupby(['Country_Region','Province_State']).tail(1) dftest.drop(['Province_State'],axis=1,inplace=True) dftest.rename(columns={'Province_State_orig':'Province_State'},inplace=True)",No,4,10.0 "mask = dftest[""ConfirmedCases""].isnull() print(mask.sum()) errors = dftest.loc[mask] print(errors) mask = dftest[""Fatalities""].isnull() print(mask.sum()) errors = dftest.loc[mask] print(errors) dftest.drop(['Province_State','Country_Region','Date'],axis=1,inplace=True) print(""dftest columns ="",dftest.columns) '",Yes,4,39.0 "#print(dftest[dftest['Country_Region']=='Burma']) dftest.ForecastId = dftest.ForecastId.astype('int') dftest['ConfirmedCases'] = dftest['ConfirmedCases'].round().astype(int) dftest['Fatalities'] = dftest['Fatalities'].round().astype(int) dftest.to_csv('submission.csv', index=False) ",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn import preprocessing import time from datetime import datetime from scipy import integrate, optimize import warnings warnings.filterwarnings('ignore') # ML libraries import lightgbm as lgb import xgboost as xgb from xgboost import plot_importance, plot_tree from sklearn.model_selection import RandomizedSearchCV, GridSearchCV from sklearn import linear_model from sklearn.metrics import mean_squared_error from sklearn.linear_model import Ridge, RidgeCV",No,5,23.0 "submission_example = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") display(train.head(5)) display(train.describe()) print(""Number of Country_Region: "", train['Country_Region'].nunique()) print(""Dates go from day"", max(train['Date']), ""to day"", min(train['Date']), "", a total of"", train['Date'].nunique(), ""days"") print(""Countries with Province/State informed: "", train[train['Province_State'].isna()==False]['Country_Region'].unique())'",No,4,45.0 "# Merge train and test, exclude overlap dates_overlap = ['2020-03-19','2020-03-20','2020-03-21','2020-03-22','2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31'] train2 = train.loc[~train['Date'].isin(dates_overlap)] all_data = pd.concat([train2, test], axis = 0, sort=False) # Double check that there are no informed ConfirmedCases and Fatalities after 2020-03-11 all_data.loc[all_data['Date'] >= '2020-03-19', 'ConfirmedCases'] = np.nan all_data.loc[all_data['Date'] >= '2020-03-19', 'Fatalities'] = np.nan all_data['Date'] = pd.to_datetime(all_data['Date']) # Create date columns le = preprocessing.LabelEncoder() all_data['Day_num'] = le.fit_transform(all_data.Date) all_data['Day'] = all_data['Date'].dt.day all_data['Month'] = 
all_data['Date'].dt.month all_data['Year'] = all_data['Date'].dt.year # Fill null values given that we merged train-test datasets all_data['Province_State'].fillna(""None"", inplace=True) all_data['ConfirmedCases'].fillna(0, inplace=True) all_data['Fatalities'].fillna(0, inplace=True) all_data['Id'].fillna(-1, inplace=True) all_data['ForecastId'].fillna(-1, inplace=True) display(all_data) display(all_data.loc[all_data['Date'] == '2020-03-19'])'",Yes,3,8.0 "missings_count = {col:all_data[col].isnull().sum() for col in all_data.columns} missings = pd.DataFrame.from_dict(missings_count, orient='index') print(missings.nlargest(30, 0))",No,5,39.0 "def calculate_trend(df, lag_list, column): for lag in lag_list: trend_column_lag = ""Trend_"" + column + ""_"" + str(lag) df[trend_column_lag] = (df[column]-df[column].shift(lag, fill_value=-999))/df[column].shift(lag, fill_value=0) return df def calculate_lag(df, lag_list, column): for lag in lag_list: column_lag = column + ""_"" + str(lag) df[column_lag] = df[column].shift(lag, fill_value=0) return df ts = time.time() all_data = calculate_lag(all_data, range(1,7), 'ConfirmedCases') all_data = calculate_lag(all_data, range(1,7), 'Fatalities') all_data = calculate_trend(all_data, range(1,7), 'ConfirmedCases') all_data = calculate_trend(all_data, range(1,7), 'Fatalities') all_data.replace([np.inf, -np.inf], 0, inplace=True) all_data.fillna(0, inplace=True) print(""Time spent: "", time.time()-ts)'",No,3,8.0 "all_data[all_data['Country_Region']=='Spain'].iloc[40:50][['Id', 'Province_State', 'Country_Region', 'Date', 'ConfirmedCases', 'Fatalities', 'ForecastId', 'Day_num', 'ConfirmedCases_1', 'ConfirmedCases_2', 'ConfirmedCases_3', 'Fatalities_1', 'Fatalities_2', 'Fatalities_3']]",No,5,14.0 "# Load countries data file world_population = pd.read_csv(""/kaggle/input/population-by-country-2020/population_by_country_2020.csv"") # Select desired columns and rename some of them world_population = world_population[['Country (or dependency)', 'Population (2020)', 'Density (P/Km)', 'Land Area (Km)', 'Med. 
Age', 'Urban Pop %']] world_population.columns = ['Country (or dependency)', 'Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop'] # Replace United States by US world_population.loc[world_population['Country (or dependency)']=='United States', 'Country (or dependency)'] = 'US' # Remove the % character from Urban Pop values world_population['Urban Pop'] = world_population['Urban Pop'].str.rstrip('%') # Replace Urban Pop and Med Age ""N.A"" by their respective modes, then transform to int world_population.loc[world_population['Urban Pop']=='N.A.', 'Urban Pop'] = int(world_population.loc[world_population['Urban Pop']!='N.A.', 'Urban Pop'].mode()[0]) world_population['Urban Pop'] = world_population['Urban Pop'].astype('int16') world_population.loc[world_population['Med Age']=='N.A.', 'Med Age'] = int(world_population.loc[world_population['Med Age']!='N.A.', 'Med Age'].mode()[0]) world_population['Med Age'] = world_population['Med Age'].astype('int16') print(""Cleaned country details dataset"") display(world_population) # Now join the dataset to our previous DataFrame and clean missings (not match in left join)- label encode cities print(""Joined dataset"") all_data = all_data.merge(world_population, left_on='Country_Region', right_on='Country (or dependency)', how='left') all_data[['Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop']] = all_data[['Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop']].fillna(0) display(all_data) print(""Encoded dataset"") # Label encode countries and provinces. Save dictionary for exploration purposes all_data.drop('Country (or dependency)', inplace=True, axis=1) all_data['Country_Region'] = le.fit_transform(all_data['Country_Region']) number_c = all_data['Country_Region'] countries = le.inverse_transform(all_data['Country_Region']) country_dict = dict(zip(countries, number_c)) all_data['Province_State'] = le.fit_transform(all_data['Province_State']) number_p = all_data['Province_State'] province = le.inverse_transform(all_data['Province_State']) province_dict = dict(zip(province, number_p)) display(all_data)'",Yes,1,45.0 "# Filter selected features data = all_data.copy() features = ['Id', 'ForecastId', 'Country_Region', 'Province_State', 'ConfirmedCases', 'Fatalities', 'Day_num'] data = data[features] # Apply log transformation to all ConfirmedCases and Fatalities columns, except for trends data[['ConfirmedCases', 'Fatalities']] = data[['ConfirmedCases', 'Fatalities']].astype('float64') data[['ConfirmedCases', 'Fatalities']] = data[['ConfirmedCases', 'Fatalities']].apply(lambda x: np.log1p(x)) # Replace infinites data.replace([np.inf, -np.inf], 0, inplace=True) # Split data into train/test def split_data(data): # Train set x_train = data[data.ForecastId == -1].drop(['ConfirmedCases', 'Fatalities'], axis=1) y_train_1 = data[data.ForecastId == -1]['ConfirmedCases'] y_train_2 = data[data.ForecastId == -1]['Fatalities'] # Test set x_test = data[data.ForecastId != -1].drop(['ConfirmedCases', 'Fatalities'], axis=1) # Clean Id columns and keep ForecastId as index x_train.drop('Id', inplace=True, errors='ignore', axis=1) x_train.drop('ForecastId', inplace=True, errors='ignore', axis=1) x_test.drop('Id', inplace=True, errors='ignore', axis=1) x_test.drop('ForecastId', inplace=True, errors='ignore', axis=1) return x_train, y_train_1, y_train_2, x_test # Ridge replace of the Linear regression model def ridge_reg(X_train, Y_train, X_test): # Create Ridge regression object #regr = Ridge() # commit 2 #regr = RidgeCV(cv=5) # 
commit 4 #regr = Ridge(alpha=10) # commit 5 regr = Ridge(alpha=10) # now # Train the model using the training sets regr.fit(X_train, Y_train) # Make predictions using the testing set y_pred = regr.predict(X_test) return regr, y_pred # Submission function def get_submission(s, df, target1, target2): prediction_1 = df[target1] prediction_2 = df[target2] # Submit predictions prediction_1 = [int(item) for item in list(map(round, prediction_1))] prediction_2 = [int(item) for item in list(map(round, prediction_2))] submission = pd.DataFrame({ ""ForecastId"": df['ForecastId'].astype('int32'), ""ConfirmedCases"": prediction_1, ""Fatalities"": prediction_2 }) submission.to_csv(s + '.csv', index=False)'",Yes,2,7.0 "# Select train (real) data from March 1 to March 22nd dates_list = ['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10', '2020-03-11','2020-03-12','2020-03-13','2020-03-14','2020-03-15','2020-03-16','2020-03-17','2020-03-18', '2020-03-19','2020-03-20','2020-03-21','2020-03-22','2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31']",No,4,14.0 all_data.loc[all_data['Country_Region']==country_dict['Spain']][40:65],No,5,14.0 "def plot_rreg_basic_country(data, country_name, dates_list, day_start, shift): data_country = data[data['Country_Region']==country_dict[country_name]] data_country = data_country.loc[data_country['Day_num']>=day_start] X_train, Y_train_1, Y_train_2, X_test = split_data(data_country) model, pred = ridge_reg(X_train, Y_train_1, X_test) # Create a df with both real cases and predictions (predictions starting on March 12th) X_train_check = X_train.copy() X_train_check['Target'] = Y_train_1 X_test_check = X_test.copy() X_test_check['Target'] = pred X_final_check = pd.concat([X_train_check, X_test_check]) # Select predictions from March 1st to March 25th predicted_data = X_final_check.loc[(X_final_check['Day_num'].isin(list(range(day_start, day_start+len(dates_list)))))].Target real_data = train.loc[(train['Country_Region']==country_name) & (train['Date'].isin(dates_list))]['ConfirmedCases'] dates_list_num = list(range(0,len(dates_list))) # Plot results fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6)) ax1.plot(dates_list_num, np.expm1(predicted_data)) ax1.plot(dates_list_num, real_data) ax1.axvline(17-shift, linewidth=2, ls = ':', color='grey', alpha=0.5) ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left') ax1.set_xlabel(""Day count (from March "" + str(1+shift) + "" to March 25th)"") ax1.set_ylabel(""Confirmed Cases"") ax2.plot(dates_list_num, predicted_data) ax2.plot(dates_list_num, np.log1p(real_data)) ax2.axvline(17-shift, linewidth=2, ls = ':', color='grey', alpha=0.5) ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left') ax2.set_xlabel(""Day count (from March "" + str(1+shift) + "" to March 30th)"") ax2.set_ylabel(""Log Confirmed Cases"") plt.suptitle((""ConfirmedCases predictions based on Log-Lineal Regression for ""+country_name)) # Filter Spain, run the Linear Regression workflow country_name = ""Spain"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)'",Yes,3,56.0 "# Filter Spain, run the Linear Regression workflow country_name = ""Spain"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] 
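# Editor's note: same workflow as the Spain fit above, but with march_day = 15
# the regression is trained only from Day_num 54 onward (Day_num 39 maps to
# March 1st, so this fit uses data from roughly March 16th on).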
plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,4,56.0 "# Filter Italy, run the Linear Regression workflow country_name = ""Italy"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Italy, run the Linear Regression workflow country_name = ""Italy"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,2,56.0 "# Filter Germany, run the Linear Regression workflow country_name = ""Germany"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,56.0 "# Filter Germany, run the Linear Regression workflow country_name = ""Germany"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Albania, run the Linear Regression workflow country_name = ""Albania"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Albania, run the Linear Regression workflow country_name = ""Albania"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,56.0 "# Filter Andorra, run the Linear Regression workflow country_name = ""Andorra"" shift = 0 day_start = 39+shift dates_list2 = dates_list[shift:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, shift)",No,5,33.0 "# Filter Andorra, run the Linear Regression workflow country_name = ""Andorra"" shift = 7 day_start = 39+shift dates_list2 = dates_list[shift:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, shift)",No,5,33.0 "ts = time.time() def ridge_reg_basic_all_countries(data, day_start): data2 = data.loc[data.Day_num >= day_start] # Set the dataframe where we will update the predictions data_pred = data[data.ForecastId != -1][['Country_Region', 'Province_State', 'Day_num', 'ForecastId']] data_pred = data_pred.loc[data_pred['Day_num']>=day_start] data_pred['Predicted_ConfirmedCases'] = [0]*len(data_pred) data_pred['Predicted_Fatalities'] = [0]*len(data_pred) print(""Currently running Logistic Regression for all countries"") # Main loop for countries for c in data2['Country_Region'].unique(): # List of provinces provinces_list = data2[data2['Country_Region']==c]['Province_State'].unique() # If the country has several Province/State informed if len(provinces_list)>1: for p in provinces_list: data_cp = data2[(data2['Country_Region']==c) & (data2['Province_State']==p)] X_train, Y_train_1, Y_train_2, X_test = split_data(data_cp) model_1, pred_1 = ridge_reg(X_train, Y_train_1, X_test) model_2, pred_2 = ridge_reg(X_train, Y_train_2, X_test) data_pred.loc[((data_pred['Country_Region']==c) & (data2['Province_State']==p)), 'Predicted_ConfirmedCases'] = pred_1 data_pred.loc[((data_pred['Country_Region']==c) & (data2['Province_State']==p)), 'Predicted_Fatalities'] = pred_2 # No Province/State informed else: data_c = data2[(data2['Country_Region']==c)] X_train, Y_train_1, Y_train_2, X_test = split_data(data_c) model_1, pred_1 = ridge_reg(X_train, Y_train_1, X_test) model_2, pred_2 = ridge_reg(X_train, Y_train_2, X_test) 
data_pred.loc[(data_pred['Country_Region']==c), 'Predicted_ConfirmedCases'] = pred_1 data_pred.loc[(data_pred['Country_Region']==c), 'Predicted_Fatalities'] = pred_2 # Apply exponential transf. and clean potential infinites due to final numerical precision data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']] = data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']].apply(lambda x: np.expm1(x)) data_pred.replace([np.inf, -np.inf], 0, inplace=True) return data_pred day_start = 52 data_pred = ridge_reg_basic_all_countries(data, day_start) get_submission('submission', data_pred, 'Predicted_ConfirmedCases', 'Predicted_Fatalities') print(""Process finished in "", round(time.time() - ts, 2), "" seconds"")'",Yes,3,48.0 "import numpy as np import pandas as pd import os sub = pd.read_csv(""../input/submissions/submission.csv"") sub.to_csv(""submission.csv"", index=False)",No,3,45.0 "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) Predictions = pd.read_csv('../input/covid-predictions/COVID_predictions.csv') Predictions.to_csv('submission.csv',index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session sub = pd.read_csv(""/kaggle/input/inputsubscsv/subs.csv"") sub.to_csv(""./submission.csv"", index=False)'",No,4,88.0 "# Data Handling import pandas as pd import numpy as np import math import scipy.stats as sps #from scipy import stats, integrate from time import time # sklearn and models from sklearn import preprocessing, ensemble, metrics, feature_selection, model_selection, pipeline import xgboost as xgb #plotting and display from IPython.display import display from matplotlib import pyplot",No,5,22.0 "# create date parser dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d') # create data type converters dtype_map_weather = dict(Station = 'str') dtype_map_test_train = dict(Block = 'str', Street = 'str') # read data into PANDAS DataFrames with date parsing test = pd.read_csv('../input/test.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train) train = pd.read_csv('../input/train.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train) weather = pd.read_csv('../input/weather.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_weather) sample_sub = pd.read_csv('../input/sampleSubmission.csv')",No,4,45.0 "print('Train') display(train.info()) print('Test') 
display(test.info())",No,5,40.0 "print('Weather') display(weather.info())",No,5,40.0 "# weather weather_exclude = ['Dewpoint', 'WetBulb', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','DewPoint'] weather_cols = [col for col in weather.columns if col not in weather_exclude] weather = weather[weather_cols] # train train_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'NumMosquitos'] train_cols = [col for col in train.columns if col not in train_exclude] train = train[train_cols] # test test_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'Id'] test_cols = [col for col in test.columns if col not in test_exclude] test = test[test_cols]",No,4,10.0 weather.info(),No,5,40.0 "print('Weather') display(weather.head()) print('Train') display(train.head())",No,5,41.0 "# what species have been detected (note that according to the CDC each # of these species can carry WNV) set(train.Species)",No,5,57.0 "# does this correspond to the test set set(test.Species) # it looks like there is another category",No,5,57.0 train.groupby('Species').sum().WnvPresent,No,5,60.0 "miss_weather = ['M', '-'] trace_weather = ['T']",No,4,10.0 cols_not_date = [col for col in weather.columns if col != 'Date'],No,5,14.0 "weather[cols_not_date].apply(pd.value_counts, axis=1)[miss_weather + trace_weather].fillna(0).sum()",No,5,17.0 "# Both stations check.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "# Station 1 check_stat1.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "# Station 2() check_stat2.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "weather = weather.replace('M', np.NaN) weather = weather.replace('-', np.NaN) weather = weather.replace('T', 0.005) # very small amounts of rain can impact mosquito hatches weather.Tmax = weather.Tmax.fillna(method = 'ffill') weather.Tmin = weather.Tmin.fillna(method = 'ffill') weather.Depart = weather.Depart.fillna(method = 'ffill') weather.Heat = weather.Heat.fillna(method = 'ffill') weather.Cool = weather.Cool.fillna(method = 'ffill') weather.PrecipTotal = weather.PrecipTotal.fillna(method = 'ffill')",No,4,17.0 "# convert datatpypes to_numeric = ['Tmax','Tmin','Tavg', 'Depart', 'Heat', 'Cool', 'PrecipTotal'] for col in to_numeric: weather[col]= pd.to_numeric(weather[col])",No,5,16.0 "weather.Sunrise = weather.Sunrise.fillna(method = 'ffill') weather.Sunset = weather.Sunset.fillna(method = 'ffill')",No,5,17.0 "# sunset has entries where instead of incrementing to the next hour after xx59 it incremented to xx60 # This causes an exception, let's take a look counter = 0 tracker = [] for index, val in enumerate(weather.Sunset): try: pd.to_datetime(val, format = '%H%M').time() except: counter += 1 tracker.append((index, val, val[2:], counter)) print(tracker[-1]) # there are 48 exceptions",No,4,16.0 "# time conversion lambda function time_func = lambda x: pd.Timestamp(pd.to_datetime(x, format = '%H%M'))",No,5,16.0 weather.Sunrise = weather.Sunrise.apply(time_func),No,5,8.0 weather.Sunset = weather.Sunset.apply(time_func),No,5,8.0 "# what is the range of values for sunrise and sunset (in hours) minutes= (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')",No,5,16.0 "#create a DayLength column with minute level precsion weather['DayLength_MPrec'] = (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')/60",No,5,16.0 "#create a DayLength column with rounded to the nearest hour weather['DayLength_NearH'] = np.round(((weather.Sunset - 
weather.Sunrise).astype('timedelta64[m]')/60).values)",No,5,8.0 "# length of night with minute level precision weather['NightLength_MPrec']= 24.0 - weather.DayLength_MPrec",No,5,8.0 "# lenght of night rounded to nearest hour weather['NightLength_NearH']= 24.0 - weather.DayLength_NearH",No,5,8.0 "# function to calculate sunset and sunrise times in hours hours_RiseSet_func = lambda x: x.minute/60.0 + float(x.hour)",No,5,84.0 "# sunrise in hours weather['Sunrise_hours'] = weather.Sunrise.apply(hours_RiseSet_func)",No,5,8.0 "# sunset in hours weather['Sunset_hours'] = weather.Sunset.apply(hours_RiseSet_func)",No,5,8.0 "station_blend = pd.DataFrame((station_1.values + station_2.values)/2, columns= blended_cols)",No,5,12.0 "extract_2 = weather[weather.Station == '2'].reset_index(drop = True) extract_2.head()",No,4,14.0 "extract_1 = weather[weather.Station == '1'].reset_index(drop = True) extract_1.head()",No,5,14.0 "joined_1 = extract_1.join(station_blend) joined_2 = extract_2.join(station_blend)",No,5,32.0 "weather_blend = pd.concat([joined_1, joined_2])",No,5,11.0 weather_blend.info(),No,5,40.0 "month_func = lambda x: x.month day_func= lambda x: x.day day_of_year_func = lambda x: x.dayofyear week_of_year_func = lambda x: x.week # train train['month'] = train.Date.apply(month_func) train['day'] = train.Date.apply(day_func) train['day_of_year'] = train.Date.apply(day_of_year_func) train['week'] = train.Date.apply(week_of_year_func) # test test['month'] = test.Date.apply(month_func) test['day'] = test.Date.apply(day_func) test['day_of_year'] = test.Date.apply(day_of_year_func) test['week'] = test.Date.apply(week_of_year_func)",No,4,8.0 "# remove sunrise and sunset since we have extracted critical information into other fields weather_blend = weather_blend.drop(['Sunrise', 'Sunset'], axis= 1)",No,5,10.0 "train = train.merge(weather_blend, on='Date') test = test.merge(weather_blend, on='Date')",No,5,32.0 "weather_blend.ix[:,:12].describe()",No,4,40.0 "weather_blend.ix[:,12:].describe()",No,4,40.0 "# split the data into two dataframes by station train_station_1= train[train.Station == '1'] train_station_2= train[train.Station == '2'] test_station_1= test[test.Station == '1'] test_station_2= test[test.Station == '2']",No,5,14.0 "# set up a merge for stations 1 and 2 # keep unique cols from station 2 keep_cols = ['Date', u'Tmax', u'Tmin', u'Tavg',u'PrecipTotal'] train_station_2 = train_station_2[keep_cols] test_station_2 = test_station_2[keep_cols] # rename cols with prefix prefix_s2 = 'stat_2_' rename_cols_s2 = [prefix_s2 + col for col in train_station_2.columns] train_station_2.columns = rename_cols_s2 test_station_2.columns = rename_cols_s2",No,3,61.0 "# drop cols from station 1 that won't be used in model drop_cols = ['Heat', 'Cool', 'Depart', 'NightLength_MPrec', 'NightLength_NearH', 'blended_Depart', 'blended_Heat', 'blended_Cool'] train_station_1 = train_station_1.drop(drop_cols, axis= 1) test_station_1 = test_station_1.drop(drop_cols, axis= 1) ",No,5,10.0 "# raname uniqe station 1 columns prefix_s1 = 'stat_1_' rename_cols_s1 = [prefix_s1 + col for col in keep_cols] cols_to_rename= [col for col in train_station_1.columns if col in keep_cols] # setup name mapping s1_name_map = dict(zip(cols_to_rename, rename_cols_s1)) train_station_1 = train_station_1.rename(columns= s1_name_map) test_station_1 = test_station_1.rename(columns= s1_name_map)",No,4,61.0 "# concat (outer join) train_station_1 = train_station_1.reset_index(drop= True) train_station_2 = train_station_2.reset_index(drop = True) 
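# [Editor's note] The .ix indexer used a few cells above (weather_blend.ix[:, :12]) has been
# removed from modern pandas; the positional equivalent is .iloc, e.g.
# weather_blend.iloc[:, :12].describe() and weather_blend.iloc[:, 12:].describe().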
train_merge = pd.concat([train_station_1, train_station_2], axis= 1) test_station_1 = test_station_1.reset_index(drop= True) test_station_2 = test_station_2.reset_index(drop = True) test_merge = pd.concat([test_station_1, test_station_2], axis= 1)",No,4,11.0 "# remove dates train_merge = train_merge.drop(['stat_1_Date', 'stat_2_Date'], axis = 1) test_merge = test_merge.drop(['stat_1_Date', 'stat_2_Date' ], axis = 1)",No,5,10.0 "# add lat and long integer columns train_merge['Lat_int'] = train_merge.Latitude.astype(int) train_merge['Long_int'] = train_merge.Longitude.astype(int) test_merge['Lat_int'] = test_merge.Latitude.astype(int) test_merge['Long_int'] = test_merge.Longitude.astype(int)",No,5,16.0 "#train_merge= train_merge.drop(['Street', 'Trap', 'Station'], axis= 1) #test_merge= test_merge.drop(['Street', 'Trap', 'Station'], axis= 1) train_merge= train_merge.drop('Station', axis= 1) test_merge= test_merge.drop('Station', axis= 1)",No,5,10.0 len(train_merge.columns),No,5,40.0 len(test_merge.columns),No,5,40.0 "test_merge= test_merge.drop(unique_test_cols, axis= 1)",No,5,10.0 "clf = ensemble.RandomForestClassifier(n_estimators=1000, min_samples_split= 2, random_state= 42) clf.fit(train_merge, labels)",No,5,7.0 "# create predictions and submission file predictions_randfor = clf.predict_proba(test_merge)[:,1]",No,5,48.0 "# fit model no training data xgbc = xgb.XGBClassifier(seed= 42) xgbc.fit(train_merge, labels) # feature importance #print(xgb.feature_importances_) # plot feature importance fig, ax = pyplot.subplots(figsize=(10, 15)) xgb.plot_importance(xgbc, ax=ax) #pyplot.show()",No,4,7.0 xgbc.get_fscore(),No,5,79.0 "# feature importance xgbc.get_fscore() #print(xgbc.feature_importances_)",No,5,84.0 "def calc_roc_auc(y, predict_probs): """""" Function accepts labels (matrix y) and predicted probabilities Function calculates fpr (false positive rate), tpr (true postivies rate), thresholds and auc (area under the roc curve) Function returns auc """""" fpr, tpr, thresholds = metrics.roc_curve(y, predict_probs) roc_auc = metrics.auc(fpr, tpr) return roc_auc",No,5,84.0 "train_split, val_split, label_train_split, label_val_split = model_selection.train_test_split(train_merge, labels, test_size = 0.33, random_state = 42, stratify= labels)",No,5,13.0 train_merge.shape,No,5,58.0 "# initialize and fit model xgb_clf= xgb.XGBClassifier(seed= 42) xgb_clf.fit(sfm_train, labels)",No,4,7.0 "sfm_test = sfm.transform(test_merge) predictions_xgb = xgb_clf.predict_proba(sfm_test)[:,1]",No,4,48.0 "X_train= train_split X_test= val_split y_train= label_train_split y_test= label_val_split model= xgb.XGBClassifier(seed= 42) eval_set = [(X_train, y_train), (X_test, y_test)] model.fit(X_train, y_train, eval_metric=""auc"", eval_set=eval_set, verbose=True)",Yes,3,7.0 "results = model.evals_result() print(results)",No,5,28.0 "model.fit(X_train, y_train, eval_metric=[""auc"", ""logloss"", ""error""], eval_set=eval_set) # retrieve performance metrics results = model.evals_result() epochs = len(results['validation_0']['auc']) x_axis = range(0, epochs) # plot auc fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['auc'], label='Train') ax.plot(x_axis, results['validation_1']['auc'], label='Test') ax.legend() pyplot.ylabel('AUC') pyplot.title('XGBoost AUC by Epoch') pyplot.show() # plot logloss fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['logloss'], label='Train') ax.plot(x_axis, results['validation_1']['logloss'], label='Test') ax.legend() pyplot.ylabel('Logloss') 
pyplot.title('XGBoost Logloss by Epoch') pyplot.show() # plot error fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['error'], label='Train') ax.plot(x_axis, results['validation_1']['error'], label='Test') ax.legend() pyplot.ylabel('Error') pyplot.title('XGBoost Error by Epoch') pyplot.show()'",No,4,35.0 "eval_set = [(X_test, y_test)] model.fit(X_train, y_train, eval_metric=[""auc""], eval_set=eval_set, early_stopping_rounds=10) results = model.evals_result() print(results)",Yes,3,7.0 "# Utility function to report best scores def report(results, n_top=3): for i in range(1, n_top + 1): candidates = np.flatnonzero(results['rank_test_score'] == i) for candidate in candidates: print(""Model with rank: {0}"".format(i)) print(""Mean validation score: {0:.3f} (std: {1:.3f})"".format( results['mean_test_score'][candidate], results['std_test_score'][candidate])) print(""Parameters: {0}"".format(results['params'][candidate])) print("""")'",No,4,1.0 "#n_estimators_dist= np.random.randint(1, 500)# number of trees, could use a discrete list or np.random.exponential(scale=0.1, size= 100) #colsample_bytree_dist= np.random.uniform(0.2,0.6) # should be 0.3 - 0.5 #max_depth_dist = np.random.randint(2, 12) # typical values 3 - 10 #learning_rate_dist= np.random.uniform(0.01, 0.3) # default 0.3, typical values 0.01 - 0.2 #learning_rate_dist= scipy.stats.expon(scale=100) #learning_rate_dist= 10. ** np.arange(-3, -2) n_estimators_dist= sps.randint(1, 300) learning_rate_dist = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]",No,5,5.0 "#cv = model_selection.StratifiedShuffleSplit(n_splits = 10, random_state = 42) param_dist = dict(learning_rate= learning_rate_dist, n_estimators= n_estimators_dist) # run randomized search n_iter_search = 20 random_search = model_selection.RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, scoring= 'roc_auc') start = time() random_search.fit(X_train, y_train) print(""RandomizedSearchCV took %.2f seconds for %d candidates"" "" parameter settings."" % ((time() - start), n_iter_search)) report(random_search.cv_results_)'",Yes,3,6.0 "sample_sub['WnvPresent'] = predictions_xgb sample_sub.to_csv('sub_xgb.csv', index=False) #sample_sub['WnvPresent'] = predictions_randfor #sample_sub.to_csv('sub_randfor.csv', index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from matplotlib import pyplot as plt plt.style.use('fivethirtyeight') # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from sklearn.metrics import f1_score, log_loss, precision_score, confusion_matrix, classification_report import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,88.0 "test_stage_1 = pd.read_csv(""../input/test_stage_1.tsv"", sep=""\\t"")'",No,5,45.0 test_stage_1[0:5],No,5,41.0 "# assigning the GAP dev data as test data test_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') # assigning the GAP test data as train data train_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') valid_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t')'",No,5,45.0 "# using the full set of training and validation data train_df = pd.concat([train_df,valid_df])",No,5,11.0 "def scrape_url(url): ''' get the title of the wikipedia page and replace ""_"" with white space ''' return url[29:].lower().replace(""_"","" "") def check_name_in_string(name,string): ''' check whether the name string is a substring of another string (i.e. wikipedia title) ''' return name.lower() in string def predict_coref(df): pred =[] for index, row in df.iterrows(): wiki_title = scrape_url(row[""URL""]) if (check_name_in_string(row[""A""],wiki_title)): pred.append(""A"") else: if (check_name_in_string(row[""B""],wiki_title)): pred.append(""B"") else: pred.append(""NEITHER"") return pred train_pred = predict_coref(train_df) test_pred = predict_coref(test_df)'",No,5,53.0 "gold_train = [] for index, row in train_df.iterrows(): if (row[""A-coref""]): gold_train.append(""A"") else: if (row[""B-coref""]): gold_train.append(""B"") else: gold_train.append(""NEITHER"") gold_test = [] for index, row in test_df.iterrows(): if (row[""A-coref""]): gold_test.append(""A"") else: if (row[""B-coref""]): gold_test.append(""B"") else: gold_test.append(""NEITHER"") ",No,3,12.0 " print(f1_score( gold_train, train_pred, average='micro')) print(classification_report( gold_train, train_pred)) print(confusion_matrix(gold_train, train_pred))",No,5,28.0 "def prec_prob(gold, pred, test): ''' Using the training set to determine the precision by class and assigning it to the test data set ''' scores = [] precision = precision_score(gold, pred, average=None, labels=['A','B','NEITHER']) A_prec = precision[0] B_prec = precision[1] Neither_prec = precision[2] for ante in test: if (ante == 'A'): scores.append([A_prec, B_prec*B_prior, Neither_prec*Neither_prior]) else: if (ante =='B'): scores.append([A_prec*A_prior, B_prec, Neither_prec*Neither_prior]) else: scores.append([A_prec*A_prior,B_prec*B_prior,Neither_prec]) return scores",No,3,49.0 " scores_train = prec_prob(gold_train, train_pred, train_pred) log_loss(gold_train,scores_train)",No,4,49.0 " scores_test = prec_prob(gold_train, train_pred, test_pred) log_loss(gold_test,scores_test)",No,4,49.0 "sample_submission = pd.read_csv(""../input/sample_submission_stage_1.csv"")",No,5,45.0 "sample_submission[['A','B','NEITHER']] = scores_test",No,5,55.0 "sample_submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "#Gender classification using universal data set #import libaries import numpy as np import pandas as pd #ML Packages from sklearn.feature_extraction.text import CountVectorizer from 
sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction import DictVectorizer #load teh data set data_set = pd.read_csv(""../input/names-dataset/names_dataset.csv"") xfeatures = data_set[""name""] #feature extraction cv = CountVectorizer() X = cv.fit_transform(xfeatures) data_set.sex.replace({'F':0,'M':1},inplace=True) #features X = X #label data_set.drop_duplicates(keep=""first"", inplace=True) y =data_set.sex from collections import Counter print(""ty"",Counter(y)) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # Naive Bayes Classifier from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB() clf.fit(X, y) def Predict(data): test_name = [data] vector = cv.transform(test_name).toarray() result = clf.predict(vector)[0] return result '",Yes,3,7.0 "train_set = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_1.tsv"", encoding=""utf-8"", error_bad_lines=False, delimiter='\\t') train_set[""A""] = train_set[""A""].apply(Predict) train_set[""B""] = train_set[""B""].apply(Predict) '",No,3,45.0 "train_set = train_set[[""ID"", ""A"", 'B'] ]'",No,5,10.0 " train_set[""NEITHER""] = abs(train_set[""NEITHER""].astype(int))",No,5,16.0 "train_set.to_csv('sub.csv', index=False)",No,5,25.0 "b""import numpy as np\nimport pandas as pd\nimport spacy\nfrom spacy import displacy\nnlp = spacy.load('en_core_web_sm')\nimport nltk\nfrom sklearn import *\n\ntest = pd.read_csv('../input/test_stage_2.tsv', delimiter='\\t').rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})\nsub = pd.read_csv('../input/sample_submission_stage_2.csv')\ntest.shape, sub.shape""",No,3,45.0 "gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') #Adding gh_train for stage two submission with new test data, will also add any new data available via Kaggle Competition data for stage2 :) gh_train = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_train, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",No,4,45.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['A-dist_abs'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist_abs'] = (df['Pronoun-offset'] - df['B-offset']).abs() df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']) df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) df['A_max'] = (df['A-offset2'] == df['section_max']).astype(int) df['A_min'] = (df['A-offset2'] == df['section_min']).astype(int) df['B_max'] = (df['B-offset2'] == df['section_max']).astype(int) df['B_min'] = (df['B-offset2'] == df['section_min']).astype(int) df['wc'] = df.apply(lambda r: len(str(r['Text'][r['section_min']: r['section_max']]).split(' ')), axis=1) #df['Text'] = df.apply(lambda r: 
r['Text'][: r['Pronoun-offset']] + 'pronountarget' + r['Text'][r['Pronoun-offset'] + len(str(r['Pronoun'])): ], axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) return(df) train = get_features(train) test = get_features(test)",Yes,5,8.0 "def get_nlp_features(s, w): doc = nlp(str(s)) tokens = pd.DataFrame([[token.text, token.dep_] for token in doc], columns=['text', 'dep']) return len(tokens[((tokens['text']==w) & (tokens['dep']=='poss'))]) train['A-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) train['B-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo')) test['A-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) test['B-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo'))",No,4,8.0 "train = train.rename(columns={'A-coref':'A', 'B-coref':'B'}) train['A'] = train['A'].astype(int) train['B'] = train['B'].astype(int) train['NEITHER'] = 1.0 - (train['A'] + train['B'])",No,4,16.0 "test_stage_1 = pd.read_csv(""../input/test_stage_1.tsv"", sep=""\\t"") test_stage_2 = pd.read_csv(""../input/test_stage_2.tsv"", sep=""\\t"")'",No,5,45.0 "gap_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gap_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t')'",No,5,45.0 gap_test[0:5],No,5,41.0 "def get_prior(df): # count how many times neither antecedent is correct for the pronoun Neither_count = len(df) - sum(df[""A-coref""] | df[""B-coref""]) # count the A coreferences A_count = sum(df[""A-coref""]) # count the B coreferences B_count = sum(df[""B-coref""]) # total number of samples test_total = len(df) # compute the prior probabilities of the three classes Neither_prior = Neither_count/test_total A_prior = A_count/test_total B_prior = B_count/test_total print(""Prior probabilities:"") print(""Neither: ""+str(Neither_prior),""A: ""+str(A_prior),""B: ""+str(B_prior)) # sanity check whether everything adds up assert Neither_count + A_count + B_count == test_total return A_prior, B_prior, Neither_prior A_prior,B_prior,Neither_prior = get_prior(gap_test) ",No,5,8.0 "sample_submission = pd.read_csv(""../input/sample_submission_stage_1.csv"") ",No,5,45.0 "def assign_prior(df): sub = pd.DataFrame() for index, row in df.iterrows(): sub.loc[index, ""ID""] = row[""ID""] sub.loc[index, ""A""] = A_prior sub.loc[index, ""B""] = B_prior sub.loc[index, ""NEITHER""] = Neither_prior return sub",No,5,8.0 "train = assign_prior(gap_test) valid = assign_prior(gap_valid)",No,5,53.0 "from sklearn.metrics import log_loss def get_gold(df): gold = [] for index, row in df.iterrows(): if (row[""A-coref""]): gold.append(""A"") else: if (row[""B-coref""]): gold.append(""B"") else: gold.append(""NEITHER"") return gold",No,5,53.0 "train_gold = get_gold(gap_test) valid_gold = get_gold(gap_valid)",No,5,8.0 "train_pred = train[[""A"",""B"",""NEITHER""]] log_loss(train_gold,train_pred)",No,3,8.0 "valid_pred = valid[[""A"",""B"",""NEITHER""]] log_loss(valid_gold,valid_pred)",No,3,8.0 sub1 = assign_prior(test_stage_1),No,3,8.0 sub1[0:4],No,5,41.0 "sub1.to_csv(""submission_1.csv"", index=False)",No,5,25.0 "train_female = assign_prior(female_gap_test) train_male = assign_prior(male_gap_test) valid_female = assign_prior(female_gap_valid) valid_male = 
assign_prior(male_gap_valid)",No,3,8.0 "train_gold_female = get_gold(female_gap_test) train_gold_male = get_gold(male_gap_test)",No,3,8.0 "train_pred_female = train_female[[""A"",""B"",""NEITHER""]] log_loss(train_gold_female,train_pred_female)",No,3,8.0 "train_pred_male = train_male[[""A"",""B"",""NEITHER""]] log_loss(train_gold_male,train_pred_male)",No,3,8.0 "test_df_pop=pd.merge(test_df, country_lookup, how='left', left_on='Country_Region', right_on='Country (or dependency)')",No,5,32.0 test_df_pop.info(),No,5,40.0 "train_df_pop.drop(""Country (or dependency)"", axis=1, inplace=True)",No,5,10.0 "test_df_pop.drop(""Country (or dependency)"", axis=1, inplace=True)",No,5,10.0 %load_ext google.cloud.bigquery,No,5,53.0 "weather_df[""da""]=weather_df[""da""].astype(int)",No,5,16.0 "weather_df['day_from_jan_first'] = weather_df[""da""] + 31*(weather_df[""mo""]=='02') + 60*(weather_df[""mo""]=='03') + 91*(weather_df[""mo""]=='04') '",No,5,8.0 "train_wk1=pd.read_csv(""../input/training-dataset-from-covid-19-week-1-forecasting/train-3.csv"") ",No,5,45.0 "train_wk1['country+province'] = train_wk1['Country/Region'].fillna('') + '-' + train_wk1['Province/State'].fillna('') train_df_pop['country+province'] = train_df_pop['Country_Region'].fillna('') + '-' + train_df_pop['Province_State'].fillna('') test_df_pop['country+province'] = test_df_pop['Country_Region'].fillna('') + '-' + test_df_pop['Province_State'].fillna('')",No,4,17.0 "df = train_wk1.groupby('country+province')[['Lat', 'Long']].mean()",No,5,60.0 "train_df_pop.reset_index(drop=True, inplace=True)",No,5,84.0 "train_df_pop[""Id""]=train_df_pop.index",No,5,8.0 train_df_pop.isnull().sum(),No,5,39.0 labelencoder = LabelEncoder(),No,5,20.0 train_df_pop['Country_Region_Types'] = labelencoder.fit_transform(train_df_pop['Country_Region']),No,5,20.0 test_df_pop['Country_Region_Types'] = labelencoder.fit_transform(test_df_pop['Country_Region']),No,5,20.0 train_df_pop.head(),No,5,41.0 test_df_pop.head(),No,5,41.0 "train_df_pop.rename(columns={""Population (2020)"":""Population""}, inplace=True)",No,5,61.0 "test_df_pop.rename(columns={""Population (2020)"":""Population""}, inplace=True)",No,5,61.0 X_dataset=train_df_pop.copy(),No,5,12.0 "X_dataset=X_dataset[[""Date"",""Population"",""Density"",""day_from_jan_first"",""temp"",""min"",""max"",""stp"",""wdsp"",""prcp"",""fog"",""Country_Region_Types"",""Lat"",""Long""]]",No,5,10.0 "X_dataset[""Date""] = X_dataset[""Date""].apply(lambda x:x.replace(""-"","""")) X_dataset[""Date""] = X_dataset[""Date""].astype(int)",No,3,16.0 X_dataset.info(),No,5,40.0 X_dataset.head(),No,5,41.0 "X_dataset[""fog""] = X_dataset[""fog""].astype(int) X_dataset[""wdsp""] = X_dataset[""wdsp""].astype(float)",No,5,16.0 "X_dataset[""Date""].max()",No,5,40.0 "valid_gold_female = get_gold(female_gap_valid) valid_gold_male = get_gold(male_gap_valid)",No,3,8.0 "valid_pred_female = valid_female[[""A"",""B"",""NEITHER""]] log_loss(valid_gold_female,valid_pred_female)",No,3,8.0 "valid_pred_male = valid_male[[""A"",""B"",""NEITHER""]] log_loss(valid_gold_male,valid_pred_male)",No,3,8.0 len(female_test_stage_2),No,4,58.0 len(male_test_stage_2),No,5,58.0 sub2 = assign_prior(test_stage_2),No,3,8.0 sub2.head(),No,5,41.0 "sub2.to_csv(""submission.csv"", index=False)",No,5,25.0 "import numpy as np import pandas as pd import spacy from spacy import displacy nlp = spacy.load(""en_core_web_sm"") import nltk from sklearn import * from sklearn.model_selection import KFold from sklearn.ensemble import RandomForestClassifier import xgboost as xgb 
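# [Editor's note] In the COVID-19 cells above, LabelEncoder().fit_transform is applied to the
# train and test Country_Region columns separately; if the two country sets ever differ, the
# resulting integer codes will not match between the frames. A safer pattern is to fit the
# encoder once and then transform each frame, for example:
# labelencoder.fit(pd.concat([train_df_pop['Country_Region'], test_df_pop['Country_Region']]))
# train_df_pop['Country_Region_Types'] = labelencoder.transform(train_df_pop['Country_Region'])
# test_df_pop['Country_Region_Types'] = labelencoder.transform(test_df_pop['Country_Region'])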
from xgboost import XGBClassifier import lightgbm as lgb import warnings warnings.filterwarnings(""ignore"") import time",Yes,4,45.0 "test = pd.read_csv(""../input/test_stage_1.tsv"", delimiter=""\\t"").rename(columns={""A"": ""A_Noun"", ""B"": ""B_Noun""}) sub = pd.read_csv(""../input/sample_submission_stage_1.csv"") test.shape, sub.shape'",No,4,45.0 "gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",Yes,4,45.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split("" ""): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() return(df) train = get_features(train) test = get_features(test)'",Yes,4,8.0 "%%time def get_nlp_features(s, w): doc = nlp(str(s)) tokens = pd.DataFrame([[token.text, token.dep_] for token in doc], columns=['text', 'dep']) return len(tokens[((tokens['text']==w) & (tokens['dep']=='poss'))]) train['A-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) train['B-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo')) test['A-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) test['B-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo'))",No,5,8.0 "train = train.rename(columns={""A-coref"": ""A"", ""B-coref"": ""B""}) train[""A""] = train[""A""].astype(int) train[""B""] = train[""B""].astype(int) train[""NEITHER""] = 1.0 - (train[""A""] + train[""B""])",No,4,16.0 "col = [""Pronoun-offset"", ""A-offset"", ""B-offset"", ""section_min"", ""Pronoun-offset2"", ""A-offset2"", ""B-offset2"", ""section_max"", ""A-poss"", ""B-poss"", ""A-dist"", ""B-dist""] x1, x2, y1, y2 = model_selection.train_test_split(train[col].fillna(-1), train[[""A"", ""B"", ""NEITHER""]], test_size=0.2, random_state=1) x1.head()",No,4,13.0 "# set hyper parameters lgb_params = {""learning_rate"": 0.01, ""num_leaves"": 16, ""min_data_in_leaf"": 20, ""boosting"": ""gbdt"", ""num_iterations"": 120, ""bagging_fraction"": 0.6, ""feature_fraction"": 1.0, ""seed"": 42, ""num_threads"": -1 } """""" xgb_params = {""eta"": 0.05, ""max_depth"": 2, ""n_estimators"": 120, ""objective"": ""binary:logistic"", ""eval_metric"": ""logloss"", ""booster"": ""gbtree"", ""subsample"": 0.6, ""colsample_bytree"": 0.6, ""seed"": 42, ""n_jobs"": -1 } """""" #model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=33)) #model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**xgb_params)) model = multiclass.OneVsRestClassifier(lgb.LGBMClassifier(**lgb_params)) # 5 fold CV folds = 5 kf = 
KFold(n_splits=folds, shuffle=False, random_state=11) trn = train[col].fillna(-1) val = train[[""A"", ""B"", ""NEITHER""]] scores = [] i = 0 for train_index, test_index in kf.split(train): x1, x2 = trn.iloc[train_index], trn.iloc[test_index] y1, y2 = val.iloc[train_index], val.iloc[test_index] model.fit(x1, y1) score = metrics.log_loss(y2, model.predict_proba(x2)) print(str(i+1), ""log-loss:"", score) scores.append(score) i += 1 print(""CV Score(log-loss):"", np.mean(scores)) model.fit(train[col].fillna(-1), train[[""A"", ""B"", ""NEITHER""]]) results = model.predict_proba(test[col]) test[""A""] = results[:,0] test[""B""] = results[:,1] test[""NEITHER""] = results[:,2] test[[""ID"", ""A"", ""B"", ""NEITHER""]].to_csv(""submission.csv"", index=False)",Yes,3,7.0 "b""test = pd.read_csv('../input/test_stage_1.tsv', delimiter='\\t').rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})\nsub = pd.read_csv('../input/sample_submission_stage_1.csv')\ntest.shape, sub.shape""",No,4,45.0 "# True test here: #gh_train = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",No,4,45.0 "def get_coref(row): coref = None nlpr = nlp(row['Text']) # dunno if more direct way to get token from text offset for tok in nlpr.doc: if tok.idx == row['Pronoun-offset']: # model limitation that sometimes there are no coref clusters for the token? # also, sometimes the coref clusters will just be something like: # He: his, him, his # So there is no proper name to map back to? 
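# [Editor's note] The tok._.coref_clusters attribute read in the try block below is not provided
# by the base spaCy model loaded earlier; it comes from the neuralcoref extension, which the
# original notebook presumably registered on the pipeline with something like
# import neuralcoref followed by neuralcoref.add_to_pipe(nlp).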
try: if len(tok._.coref_clusters) > 0: coref = tok._.coref_clusters[0][0].text except: # for some, get the following exception just checking len(tok._.coref_clusters) # *** TypeError: 'NoneType' object is not iterable pass break if coref: coref = coref.lower() # sometimes the coref is I think meant to be the same as A or B, but # it is either a substring or superstring of A or B A_Noun = row['A_Noun'].lower() B_Noun = row['B_Noun'].lower() if coref in A_Noun or A_Noun in coref: coref = A_Noun elif coref in B_Noun or B_Noun in coref: coref = B_Noun return coref",Yes,2,8.0 "def get_coref_features(df): df['Coref'] = df.apply(get_coref, axis=1) df['Spacy-Coref-A'] = df['Coref'] == df['A_Noun'].str.lower() df['Spacy-Coref-B'] = df['Coref'] == df['B_Noun'].str.lower() return df train = get_coref_features(train) test = get_coref_features(test)",No,2,8.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) #df['Text'] = df.apply(lambda r: r['Text'][: r['Pronoun-offset']] + 'pronountarget' + r['Text'][r['Pronoun-offset'] + len(str(r['Pronoun'])): ], axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() return(df) train = get_features(train) test = get_features(test)",No,3,8.0 "col = ['Pronoun-offset', 'A-offset', 'B-offset', 'section_min', 'Pronoun-offset2', 'A-offset2', 'B-offset2', 'section_max', 'A-poss', 'B-poss', 'A-dist', 'B-dist', 'Spacy-Coref-A', 'Spacy-Coref-B'] x1, x2, y1, y2 = model_selection.train_test_split(train[col].fillna(-1), train[['A', 'B', 'NEITHER']], test_size=0.2, random_state=1) x1.head()",No,4,13.0 "model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth = 7, n_estimators=1000, random_state=33)) # model = multiclass.OneVsRestClassifier(ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100, random_state=33)) # param_dist = {'objective': 'binary:logistic', 'max_depth': 1, 'n_estimators':1000, 'num_round':1000, 'eval_metric': 'logloss'} # model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**param_dist)) model.fit(x1, y1) print('log_loss', metrics.log_loss(y2, model.predict_proba(x2))) model.fit(train[col].fillna(-1), train[['A', 'B', 'NEITHER']]) results = model.predict_proba(test[col]) test['A'] = results[:,0] test['B'] = results[:,1] test['NEITHER'] = results[:,2] test[['ID', 'A', 'B', 'NEITHER']].to_csv('submission.csv', index=False)",No,3,7.0 "import os import csv import json import string import keras from pandas.io.json import json_normalize import matplotlib.pyplot as plt import seaborn as sns color = sns.color_palette() from math import floor import spacy %matplotlib inline from plotly import tools import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model from sklearn.feature_extraction.text import TfidfVectorizer, 
CountVectorizer from sklearn.decomposition import TruncatedSVD import lightgbm as lgb import time from tqdm import tqdm import math from sklearn.model_selection import train_test_split import regex as re",No,5,23.0 nlp = spacy.load('en_core_web_sm'),No,5,30.0 "y1=train_df_pop[train_df_pop[""Date""]<""2020-03-19""][""ConfirmedCases""] y2=train_df_pop[train_df_pop[""Date""]<""2020-03-19""][""Fatalities""]",No,5,14.0 print(find_name_between_paran(analyze)),No,5,53.0 "#Confirmed Cases X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(X_dataset, y1, test_size = .20, random_state = 42)",No,5,13.0 "results_df = pd.DataFrame({""correct"":correct_name_list}) results_df['prv_obj'] = prev_dobj_list results_df['pr_ls_sbj'] = prev_subj_list results_df['pr_1_sbj'] = prev_first_subj_list results_df['pr_2_sbj'] = prev_second_subj_list results_df['c1st_sj'] = curr1st_subj_list results_df['c2nd_sj'] = curr2nd_subj_list results_df['c1st_ob'] = curr1st_dobj_list results_df['c2nd_ob'] = curr2nd_dobj_list results_df['c1st_ap'] = curr1st_appos_list results_df['w_bt_pa'] = word_btwn_paran_list results_df['pronoun'] = pronoun_list results_df['offset'] = pronoun_offset_list'",No,3,55.0 "import plotly.express as px import pandas as pd import plotly.graph_objects as go import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) data['PSCR'] = data.Province_State.map(str)+data.Country_Region.map(str) # %% # ip pattern of the empirical data from 2020/03/19 onwards region = pd.unique(data['PSCR']).tolist() f_region = [] time_list = [] region_name = [] actual_date = [] no_infection_country = [] for ci in range(len(region)): region_data = data[data['PSCR'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] actual_date.append(region_data.Date[1:]) f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: no_infection_country.append(region[ci]) f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'PSCR': region_name}) # %% # Simulation data for training sim_data = [] speed = [0.01,0.1,1] for batch in range(1,4): result = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(result) speed_batch = f'Sim: speed {speed[batch-1]}' sim_result = [container[key] for key in container] num_infected = [] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) inc_infected = [(num_infected[i+1]-num_infected[i])/num_infected[i] for i in range(len(num_infected)-1)] infected_growth_df = pd.DataFrame({'increase': inc_infected, 'Day': [ i for i in range(len(sim_result)-1)], 'PSCR': speed_batch}) sim_data.append(infected_growth_df) sim_df = pd.concat(sim_data) # %% criteria_day_length = 10 sim_class_ip = [] for speed in pd.unique(sim_df.PSCR): sim_class_ip.append(sim_df[sim_df['PSCR'] == speed].increase.tolist()) sim_class_ip_array = np.array(sim_class_ip) #%% labels = [] effective_region = [] for region_loop in region: if region_loop not in no_infection_country: ip = f_df[f_df['PSCR'] == region_loop].increase[:criteria_day_length].tolist() 
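# [Editor's note] The lines that follow label each region with one of the three simulated
# spreading scenarios: the Euclidean distance between the region's first criteria_day_length
# daily growth rates and the matching slice of each simulated growth-rate curve is computed,
# and the scenario with the smallest distance (later shown as Slow, Moderate or Fast) is kept.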
euclidean_dis = np.linalg.norm(np.array(ip)-sim_class_ip_array[:,:len(ip)],axis = 1) labels.append(np.where(euclidean_dis == min(euclidean_dis))[0][0]) effective_region.append(region_loop) else: pass xlabels = ['Slow','Moderate','Fast'] scenario_class = {'ip': [xlabels[i] for i in labels], 'Area':effective_region, 'width': [1 for i in range(len(labels))]} sce_df = pd.DataFrame(scenario_class) #%% fig = px.bar(sce_df, x=""ip"", y=""width"", color='Area', height=400) fig.update_layout(title='Strategies of regions', xaxis_title='Strategy', yaxis_title='Areas and regions', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ) ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show()'",Yes,2,45.0 results_df.head(250),No,5,41.0 "dt1=DecisionTreeRegressor(criterion=""friedman_mse"",max_depth=20,random_state=42)",No,5,4.0 "dt1.fit(X_train_confirmed, y_train_confirmed)",No,5,7.0 print(text_list[229]),No,5,41.0 y_pred_dt_confirmed=dt1.predict(X_test_confirmed),No,5,48.0 "np.sqrt(mean_squared_log_error( y_test_confirmed, y_pred_dt_confirmed ))",No,5,49.0 "train = pd.concat([train,pd.get_dummies(train['Country_Region'], prefix='ps')],axis=1) train.drop(['Country_Region'],axis=1, inplace=True) train = pd.concat([train,pd.get_dummies(train['Province_State'], prefix='ps')],axis=1) train.drop(['Province_State'],axis=1, inplace=True)",Yes,3,11.0 "doc = nlp(analyze) for token in doc: print(token.text, token.dep_, token.head.pos_)",No,5,41.0 "#Fatalities X_train_fatal, X_test_fatal, y_train_fatal, y_test_fatal = train_test_split(X_dataset, y2, test_size = .20, random_state = 42)",No,5,13.0 print(text_list[900:950]),No,5,41.0 "out_df = pd.DataFrame({""ID"":test_ids})",No,5,12.0 "# Using the data on 18 Mar to calculate the tendency of the pandemic. 
date_datause = '2020-03-18' date_actualdata = '2020-03-30' date_length = (pd.to_datetime(date_actualdata) - pd.to_datetime(date_datause)).days predict_region_list = [] effect_ind = 0 for it in range(len(region)): region_it = region[it] if region_it not in no_infection_country: time_length_it = actual_date[effect_ind] sim_class_it = labels[effect_ind] predict_ip_it = sim_class_ip_array[sim_class_it,(len(actual_date[0])-date_length):] while len(predict_ip_it)< (date_length+31): predict_ip_it = np.append(predict_ip_it,predict_ip_it[len(predict_ip_it)-1]) retion_df = data[data['PSCR'] == region_it] num_infected_it = retion_df[retion_df['Date'] == date_datause]['ConfirmedCases'].astype(float) predict_region_list_it = [] ini_infected = num_infected_it.tolist()[0] for predict_day in range(len(predict_ip_it)): predict_region_list_it.append(ini_infected * (1+predict_ip_it[predict_day])) ini_infected = predict_region_list_it[predict_day] predict_region_list.extend(predict_region_list_it) effect_ind += 1 else: predict_region_list.extend([0 for i in range(43)]) # %% # Write output csv file import csv from itertools import zip_longest list1 = [i+1 for i in range(len(predict_region_list))] list2 = predict_region_list list3 = [0 for i in range(len(predict_region_list))] d = [list1, list2,list3] export_data = zip_longest(*d, fillvalue = '') with open('submission.csv', 'w', encoding=""ISO-8859-1"", newline='') as myfile: wr = csv.writer(myfile) wr.writerow((""ForecastId"", ""ConfirmedCases"", ""Fatalities"")) wr.writerows(export_data) myfile.close()'",Yes,2,25.0 "# aggregate cases and fatalities def do_aggregation(df, col, mean_range): df_new = copy.deepcopy(df) col_new = '{}_({}-{})'.format(col, mean_range[0], mean_range[1]) df_new[col_new] = 0 tmp = df_new[col].rolling(mean_range[1]-mean_range[0]+1).mean() df_new[col_new][mean_range[0]:] = tmp[:-(mean_range[0])] df_new[col_new][pd.isna(df_new[col_new])] = 0 return df_new[[col_new]].reset_index(drop=True) def do_aggregations(df): df = pd.concat([df, do_aggregation(df, 'cases/day', [1,1]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [1,7]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [8,14]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [15,21]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [1,1]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [1,7]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [8,14]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [15,21]).reset_index(drop=True)], axis=1) for threshold in [1, 10, 100]: days_under_threshold = (df['ConfirmedCases'] 0: candidate = tc_name_words[-1] if candidate in get_tc_f_s or candidate in get_tc_l_s: if len(tc_name_words) > 1: candidate = tc_name_words[-1] if check_if_name(curr_tok,candidate): get_tc_l_nw = candidate ### tclnw Random forest classifier label special line: if get_tc_l_nw in row[inquiry_part] or row[inquiry_part] in get_tc_l_nw: train_vector.append(1) else: train_vector.append(0) #get first aposs in trunc curr @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_tc_f_a = ""none"" for n in [1,2,3,4]: dummy_tc_f_a = find_nth_appos(curr_doc,n) if check_if_name(curr_tok,dummy_tc_f_a) and get_tc_f_a == ""none"": get_tc_f_a = dummy_tc_f_a ### tcfa Random forest classifier label special line: if get_tc_f_a in row[inquiry_part] or row[inquiry_part] in 
get_tc_f_a: train_vector.append(1) else: train_vector.append(0) #get word btwn paranthesis in prev @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_p_f_wp = find_name_words(name_btwn_paran(prev)) ### pfwp Random forest classifier label special line: if get_p_f_wp in row[inquiry_part] or row[inquiry_part] in get_p_f_wp: train_vector.append(1) else: train_vector.append(0) #get word btwn paranthesis in trunc curr @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_tc_l_wp = find_name_words(name_btwn_paran(curr)) ### tclwp Random forest classifier label special line: if get_tc_l_wp in row[inquiry_part] or row[inquiry_part] in get_tc_l_wp: train_vector.append(1) else: train_vector.append(0) #get last subj in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_s = ""none"" for n in [1,2,3,4,5,6,7,8]: #in the final version, each of the name subjects will be accunted for dummy_r_f_s = find_nth_subj(curr_doc,n) if dummy_r_f_s in remainder and check_if_name(curr_tok,dummy_r_f_s): get_r_f_s = dummy_r_f_s ### rfs Random forest classifier label special line: if get_r_f_s in row[inquiry_part] or row[inquiry_part] in get_r_f_s: train_vector.append(1) else: train_vector.append(0) #get last dobj in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_o = ""none"" for n in [1,2,3,4,5,6,7,8]: #in the final version, each of the name objects will be accunted for dummy_r_f_o = find_nth_dobj(curr_doc,n) if dummy_r_f_o in remainder and check_if_name(curr_tok,dummy_r_f_o): get_r_f_o = dummy_r_f_o ### rfo Random forest classifier label special line: if get_r_f_o in row[inquiry_part] or row[inquiry_part] in get_r_f_o: train_vector.append(1) else: train_vector.append(0) #get last appos in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_a = ""none"" for n in [1,2,3,4]: dummy_r_f_a = find_nth_appos(curr_doc,n) if dummy_r_f_a in remainder and check_if_name(curr_tok,dummy_r_f_a): get_r_f_a = dummy_r_f_a ### rfa Random forest classifier label special line: if get_r_f_a in row[inquiry_part] or row[inquiry_part] in get_r_f_a: train_vector.append(1) else: train_vector.append(0) #get first appos in current @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_c_f_a = ""none"" for n in [1,2,3,4]: dummy_c_f_a = find_nth_appos(curr_doc,n) if check_if_name(curr_tok,dummy_c_f_a) and get_c_f_a == ""none"": get_c_f_a = dummy_c_f_a ### cfa Random forest classifier label special line: if get_c_f_a in row[inquiry_part] or row[inquiry_part] in get_c_f_a: train_vector.append(1) else: train_vector.append(0) #get first appos in prev @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_p_f_a = ""none"" for n in [1,2,3,4]: dummy_p_f_a = find_nth_appos(prev_doc,n) if check_if_name(prev_tok,dummy_p_f_a) and get_p_f_a == ""none"": get_p_f_a = dummy_p_f_a ### pfa Random forest classifier label special line: if get_p_f_a in row[inquiry_part] or row[inquiry_part] in get_p_f_a: train_vector.append(1) else: train_vector.append(0) #check_if_poss_her get_poss_her = check_if_poss_her(curr_doc, pronoun) #rand_forest classifier for pronoun type: if pronoun == ""he"" or pronoun == ""she"": train_vector.append(1) elif pronoun == ""He"" or pronoun == ""She"": train_vector.append(2) elif pronoun == ""his"" or (pronoun == ""her"" and get_poss_her): train_vector.append(3) elif pronoun == ""him"" or (pronoun == ""her"" and not get_poss_her): train_vector.append(4) elif pronoun == ""His"" or (pronoun == ""Her"" and get_poss_her): train_vector.append(5) else: train_vector.append(6) return train_vector'",No,2,1.0 "def run_bert(data): \t''' \tRuns a forward propagation of BERT on input text, extracting contextual word embeddings \tInput: data, a pandas 
DataFrame containing the information in one of the GAP files \tOutput: emb, a pandas DataFrame containing contextual embeddings for the words A, B and Pronoun. Each embedding is a numpy array of shape (768) \tcolumns: ""emb_A"": the embedding for word A \t ""emb_B"": the embedding for word B \t ""emb_P"": the embedding for the pronoun \t ""label"": the answer to the coreference problem: ""A"", ""B"" or ""NEITHER"" \t''' # From the current file, take the text only, and write it in a file which will be passed to BERT \ttext = data[""Text""] \ttext.to_csv(""input.txt"", index = False, header = False) \ttask_name = ""kepler"" #\tprocessors = {""kepler"": run_classifier.KeplerProcessor} #\tprocessors = {""kepler"": run_classifier.MrpcProcessor} #processor = processors[""kepler""] # The script extract_features.py runs forward propagation through BERT, and writes the output in the file output.jsonl # I'm lazy, so I'm only saving the output of the last layer. Feel free to change --layers = -1 to save the output of other layers. \tos.system(""python3 extract_features.py \\ \t --input_file=input.txt \\ \t --output_file=output.jsonl \\ \t --vocab_file=uncased_L-12_H-768_A-12/vocab.txt \\ \t --bert_config_file=uncased_L-12_H-768_A-12/bert_config.json \\ \t --init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt \\ \t --layers=-2 \\ \t --max_seq_length=256 \\ \t --batch_size=8"") \tbert_output = pd.read_json(""output.jsonl"", lines = True) \tos.system(""rm output.jsonl"") \tos.system(""rm input.txt"") \tindex = data.index \tcolumns = [""emb_A"", ""emb_B"", ""emb_P"", ""feat_A"", ""feat_B"", ""label""] \temb = pd.DataFrame(index = index, columns = columns) \temb.index.name = ""ID"" \tfor i in range(len(data)): # For each line in the data file \t\t# get the words A, B, Pronoun. Convert them to lower case, since we're using the uncased version of BERT \t\tP = data.loc[i,""Pronoun""] \t\tA = data.loc[i,""A""] \t\tB = data.loc[i,""B""] \t\t# For each word, find the offset not counting spaces. 
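# [Editor's note] compute_offset_no_spaces() and count_length_no_special() are utilities from the
# original repository and are not shown here. Judging from how they are used (aligning character
# offsets with BERT word pieces, which contain no whitespace and use '#' continuation markers),
# minimal stand-ins might look like this; they are a guess, not the original implementation.
def compute_offset_no_spaces_sketch(text, offset):
    # Count the non-space characters that precede the given character offset.
    return sum(1 for c in text[:offset] if c != ' ')

def count_length_no_special_sketch(word):
    # Length of a token, ignoring spaces and the '#' markers that BERT adds to word pieces.
    return sum(1 for c in word if c not in (' ', '#'))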
This is necessary for comparison with the output of BERT \t\tP_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""Pronoun-offset""]) \t\tA_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""A-offset""]) \t\tB_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""B-offset""]) \t\t# Figure out the length of A, B, not counting spaces or special characters \t\tA_length = count_length_no_special(A) \t\tB_length = count_length_no_special(B) \t\t# Initialize embeddings with zeros \t\temb_A = np.zeros(768) \t\temb_B = np.zeros(768) \t\temb_P = np.zeros(768) \t\t# Initialize counts \t\tcount_chars = 0 \t\tcnt_A, cnt_B, cnt_P = 0, 0, 0 \t\tfeatures = pd.DataFrame(bert_output.loc[i,""features""]) # Get the BERT embeddings for the current line in the data file \t\tfor j in range(2,len(features)): # Iterate over the BERT tokens for the current line; we skip over the first 2 tokens, which don't correspond to words \t\t\ttoken = features.loc[j,""token""] \t\t\t# See if the character count until the current token matches the offset of any of the 3 target words \t\t\tif count_chars == P_offset: \t\t\t\t# print(token) \t\t\t\temb_P += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_P += 1 \t\t\tif count_chars in range(A_offset, A_offset + A_length): \t\t\t\t# print(token) \t\t\t\temb_A += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_A += 1 \t\t\tif count_chars in range(B_offset, B_offset + B_length): \t\t\t\t# print(token) \t\t\t\temb_B += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_B += 1 # Update the character count \t\t\tcount_chars += count_length_no_special(token) \t\t# Taking the average between tokens in the span of A or B, so divide the current value by the count\t \t\temb_A /= cnt_A \t\temb_B /= cnt_B \t\t# Work out the label of the current piece of text \t\tlabel = ""Neither"" \t\tif (data.loc[i,""A-coref""] == True): \t\t\tlabel = ""A"" \t\tif (data.loc[i,""B-coref""] == True): \t\t\tlabel = ""B"" \t\tpro_offset = data.loc[i,""Pronoun-offset""] \t\tthis_text = data.loc[i,""Text""] \t\tfeat_A = get_feature_vector(P, this_text, A, B, pro_offset, inquiry_part = ""A"") \t\tfeat_B = get_feature_vector(P, this_text, A, B, pro_offset, inquiry_part = ""B"") \t\t# Put everything together in emb \t\temb.iloc[i] = [emb_A, emb_B, emb_P, np.asarray(feat_A), np.asarray(feat_B), label] \treturn em",No,4,1.0 "from keras import backend, models, layers, initializers, regularizers, constraints, optimizers from keras import callbacks as kc from keras import optimizers as ko from sklearn.model_selection import cross_val_score, KFold, train_test_split from sklearn.metrics import log_loss import time dense_layer_sizes = [37] dropout_rate = 0.6 learning_rate = 0.001 n_fold = 5 batch_size = 32 epochs = 1000 patience = 100 # n_test = 100 lambd = 0.1 # L2 regularization",No,5,59.0 "class IsLayer(Layer): #Layer to be used after a dense one. It will multiply all the elements with each other. #In a sense, it allows the neurons to have a say on each others' outputs. This layer, hopefully, #compares the relative importance of neurons.The compound prob is regulated with weights. #The idea follows from attention layer, but is more basic than that. As it is multiplicative, it is #an alternative to the vanilla additive layer where outputs are added at the next layer. def __init__(self, **kwargs): super(IsLayer, self).__init__(**kwargs) def build(self, input_shape): #Create a trainable weight variable for this layer. 
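# [Editor's note] This class uses the names Layer and K, which the keras import cell above does
# not bring into scope; the original notebook presumably also ran something like
# from keras.layers import Layer and from keras import backend as K before defining IsLayer.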
self.W = self.add_weight(name='W', shape=(input_shape[1], 1), initializer='uniform', trainable=True) super(IsLayer, self).build(input_shape) def call(self, x): x_W = K.dot(x, self.W) x_new = x*x_W return x_new def compute_output_shape(self, input_shape): return (input_shape[0], input_shape[-1])",No,5,53.0 "def build_mlp_model(input_shape, num_output): \tX_input = layers.Input(input_shape) \t# First dense layer \tX = layers.Dense(dense_layer_sizes[0], name = 'dense0')(X_input) \tX = layers.BatchNormalization(name = 'bn0')(X) \tX = layers.Activation('relu')(X) \tX = layers.Dropout(dropout_rate, seed = 7)(X) #\tX = IsLayer()(X) \t# Second dense layer # \tX = layers.Dense(dense_layer_sizes[0], name = 'dense1')(X) # \tX = layers.BatchNormalization(name = 'bn1')(X) # \tX = layers.Activation('relu')(X) # \tX = layers.Dropout(dropout_rate, seed = 9)(X) \t# Output layer \tX = layers.Dense(num_output, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X) \tX = layers.Activation('sigmoid')(X) \t# Create model \tmodel = models.Model(input = X_input, output = X, name = ""classif_model"") \treturn model'",No,5,4.0 "def parse_json(embeddings): \t''' \tParses the embeddigns given by BERT, and suitably formats them to be passed to the MLP model \tInput: embeddings, a DataFrame containing contextual embeddings from BERT, as well as the labels for the classification problem \tcolumns: ""emb_A"": contextual embedding for the word A \t ""emb_B"": contextual embedding for the word B \t ""emb_P"": contextual embedding for the pronoun \t ""label"": the answer to the coreference problem: ""A"", ""B"" or ""NEITHER"" \tOutput: X, a numpy array containing, for each line in the GAP file, the concatenation of the embeddings of the target words \t Y, a numpy array containing, for each line in the GAP file, the one-hot encoded answer to the coreference problem \t''' \tembeddings.sort_index(inplace = True) # Sorting the DataFrame, because reading from the json file messed with the order \tX = np.zeros((len(embeddings)*2,2*768+19)) #19 is the length of special feature vector \tY = np.zeros((len(embeddings)*2, 1)) \t# Concatenate features (A first batch) \tfor i in range(len(embeddings)): \t\tA = np.array(embeddings.loc[i,""emb_A""]) \t\tP = np.array(embeddings.loc[i,""emb_P""]) \t\tF = np.array(embeddings.loc[i,""feat_A""]) \t\tX[i] = np.concatenate((A, P, F)) \t# One-hot encoding for labels \tfor i in range(len(embeddings)): \t\tlabel = embeddings.loc[i,""label""] \t\tif label == ""A"": \t\t\tY[i] = 1 \t\telse: \t\t\tY[i] = 0 \t# Concatenate features (B second batch) \tfor i in range(len(embeddings)): \t\tB = np.array(embeddings.loc[i,""emb_B""]) \t\tP = np.array(embeddings.loc[i,""emb_P""]) \t\tF = np.array(embeddings.loc[i,""feat_B""]) \t\tX[i+len(embeddings)] = np.concatenate((B, P, F)) \t# One-hot encoding for labels ; A's and B's concatenated like same since they are symmetrical \tfor i in range(len(embeddings)): \t\tlabel = embeddings.loc[i,""label""] \t\tif label == ""B"": \t\t\tY[i+len(embeddings)] = 1 \t\telse: \t\t\tY[i+len(embeddings)] = 0 \treturn X, Y'",No,5,53.0 "# Read development embeddigns from json file - this is the output of Bert development = pd.read_json(""contextual_embeddings_gap_development.json"") X_development, Y_development = parse_json(development) validation = pd.read_json(""contextual_embeddings_gap_validation.json"") X_validation, Y_validation = parse_json(validation) test = pd.read_json(""contextual_embeddings_gap_test.json"") X_test, Y_test = parse_json(test)",No,3,1.0 "# There 
may be a few NaN values, where the offset of a target word is greater than the max_seq_length of BERT. # They are very few, so I'm just dropping the rows. remove_test = [row for row in range(len(X_test)) if np.sum(np.isnan(X_test[row]))] X_train = np.delete(X_test, remove_test, 0) Y_train = np.delete(Y_test, remove_test, 0) # We want predictions for all validation rows. So instead of removing rows, make them 0 remove_validation = [row for row in range(len(X_validation)) if np.sum(np.isnan(X_validation[row]))] X_validation[remove_validation] = np.zeros(2*768+19) # We want predictions for all development rows. So instead of removing rows, make them 0 remove_development = [row for row in range(len(X_development)) if np.sum(np.isnan(X_development[row]))] X_development[remove_development] = np.zeros(2*768+19)",No,5,17.0 "# Will train on data from the gap-test and gap-validation files, in total 2454 rows #X_train = np.concatenate((X_test, X_validation), axis = 0) #Y_train = np.concatenate((Y_test, Y_validation), axis = 0) # Will predict probabilities for data from the gap-development file; initializing the predictions #prediction = np.zeros((len(X_development),1)) # testing predictions val_prediction = np.zeros((len(X_validation),1)) # valid predictions",No,5,77.0 "# Training and cross-validation folds = KFold(n_splits=n_fold, shuffle=True, random_state=3) scores = [] for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)): \t# split training and validation data \tprint('Fold', fold_n, 'started at', time.ctime()) \tX_tr, X_val = X_train[train_index], X_train[valid_index] \tY_tr, Y_val = Y_train[train_index], Y_train[valid_index] \t# Define the model, re-initializing for each fold \tclassif_model = build_mlp_model([X_train.shape[1]],1) \tclassif_model.compile(optimizer = optimizers.Adam(lr = learning_rate), loss = ""binary_crossentropy"") \tcallbacks = [kc.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights = True)] \t# train the model \tclassif_model.fit(x = X_tr, y = Y_tr, epochs = epochs, batch_size = batch_size, callbacks = callbacks, validation_data = (X_val, Y_val), verbose = 0) \t# make predictions on validation and test data \tpred_valid = classif_model.predict(x = X_val, verbose = 0) \t# oof[valid_index] = pred_valid.reshape(-1,) \tscores.append(log_loss(Y_val, pred_valid)) val_prediction = classif_model.predict(x = X_validation, verbose = 0) # Print CV scores, as well as score on the test data print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores))) print(scores) print(""Test score:"", log_loss(Y_validation,val_prediction))'",No,3,1.0 "def build_neither_mlp(input_shape, num_output): \tX_input = layers.Input(input_shape) \t# First dense layer \tX = layers.Dense(dense_layer_sizes[0], name = 'dense0')(X_input) \tX = layers.BatchNormalization(name = 'bn0')(X) \tX = layers.Activation('relu')(X) \tX = layers.Dropout(dropout_rate, seed = 7)(X) # Output layer \tX = layers.Dense(num_output, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X) \tX = layers.Activation('sigmoid')(X) \t# Create model \tmodel = models.Model(input = X_input, output = X, name = ""neither_model"") \treturn model'",No,5,4.0 "X_val_A = val_prediction[: int(len(val_prediction)/2)] X_val_B = val_prediction[int(len(val_prediction)/2) :] X_train_neither = np.concatenate((X_val_A, X_val_B), axis=1) Y_val_A = Y_validation[: int(len(Y_validation)/2)] Y_val_B = Y_validation[int(len(Y_validation)/2) :] Y_train_neither = 1 - Y_val_A - 
Y_val_B",No,5,21.0 "print(X_val_A.shape) print(X_val_B.shape) print(X_train_neither.shape) print(Y_val_A.shape) print(Y_val_B.shape) print(Y_train_neither.shape)",No,5,58.0 "neither_model = build_neither_mlp([X_train_neither.shape[1]],1) neither_model.compile(optimizer = optimizers.Adam(lr = learning_rate), loss = ""binary_crossentropy"") neither_model.fit(X_train_neither, y = Y_train_neither, epochs = epochs, batch_size = batch_size, validation_data = (X_train_neither, Y_train_neither), verbose = 0) dev_prediction = classif_model.predict(x = X_development, verbose = 0)",No,3,7.0 "X_dev_A = dev_prediction[: int(len(dev_prediction)/2)] X_dev_B = dev_prediction[int(len(dev_prediction)/2) :] X_dev_neither = np.concatenate((X_dev_A, X_dev_B), axis=1)",No,5,13.0 "dev_neither = neither_model.predict(x = X_dev_neither, verbose = 0)",No,5,27.0 "# Write the prediction to file for submission submission = pd.read_csv(""../input/sample_submission_stage_1.csv"", index_col = ""ID"") submission[""A""] = X_dev_A submission[""B""] = X_dev_B submission[""NEITHER""] = dev_neither submission.to_csv(""submission_bert.csv"")",No,4,25.0 "import pandas as pd import numpy as np import keras import spacy from collections import defaultdict from sklearn.metrics import log_loss import matplotlib.pyplot as plt %matplotlib inline import time import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 nlp = spacy.load('en_core_web_lg'),No,5,30.0 "spacy_tok = SpacyTokenizer(""en"") tokenizer = Tokenizer(spacy_tok)",No,5,84.0 "df_pretrain.Text.apply(lambda x: len(tokenizer.process_text(x, spacy_tok))).describe()",No,5,40.0 "class Graph(): def __init__(self): """""" self.edges is a dict of all possible next nodes e.g. {'X': ['A', 'B', 'C', 'E'], ...} self.weights has all the weights between two nodes, with the two nodes as a tuple as the key e.g. 
{('X', 'A'): 7, ('X', 'B'): 2, ...} """""" self.edges = defaultdict(list) self.weights = {} def add_edge(self, from_node, to_node, weight, back_penalty=1): # Note: assumes edges are bi-directional self.edges[from_node].append(to_node) self.edges[to_node].append(from_node) self.weights[(from_node, to_node)] = weight self.weights[(to_node, from_node)] = weight*back_penalty def dijsktra(graph, initial, end): # shortest paths is a dict of nodes # whose value is a tuple of (previous node, weight) shortest_paths = {initial: (None, 0)} current_node = initial visited = set() while current_node != end: visited.add(current_node) destinations = graph.edges[current_node] weight_to_current_node = shortest_paths[current_node][1] for next_node in destinations: weight = graph.weights[(current_node, next_node)] + weight_to_current_node if next_node not in shortest_paths: shortest_paths[next_node] = (current_node, weight) else: current_shortest_weight = shortest_paths[next_node][1] if current_shortest_weight > weight: shortest_paths[next_node] = (current_node, weight) next_destinations = {node: shortest_paths[node] for node in shortest_paths if node not in visited} if not next_destinations: raise Exception(""Something is wrong"") # next node is the destination with the lowest weight current_node = min(next_destinations, key=lambda k: next_destinations[k][1]) # Work back through destinations in shortest path path = [] dist = 0 while current_node is not None: path.append(current_node) next_node = shortest_paths[current_node][0] dist += shortest_paths[current_node][1] current_node = next_node # Reverse path path = path[::-1] return path, dist '",No,5,53.0 "def get_rank(token): """"""Step up with token.head until it reaches the root. Returns with step number and root"""""" i = 0 next_token = token while(next_token!=next_token.head): i+=1 next_token=next_token.head return i, next_token def child_count(token): cc = 0 for child in token.children: cc+=1 return cc",No,5,53.0 "def build_answers(data): answers = [] for i in range(len(data)): dataNext = data.loc[i] Acoref = dataNext[""A-coref""] Bcoref = dataNext[""B-coref""] answerNext = [int(Acoref), int(Bcoref), 1-int(Acoref or Bcoref)] answers.append(answerNext) return np.vstack(answers)",No,5,53.0 "def build_features(data): """"""Generates features from input data"""""" features = [] sum_good = 0 for i in range(0,len(data)): fi = [] dataNext = data.loc[i] text = dataNext[""Text""] #print(visualise(dataNext)) doc=nlp(text) Aoff = dataNext[""A-offset""] Boff = dataNext[""B-offset""] Poff = dataNext[""Pronoun-offset""] lth = len(text) for token in doc: if(token.idx==Aoff): Atoken = token if(token.idx==Boff): Btoken = token if(token.idx==Poff): Ptoken=token Arank, Aroot = get_rank(Atoken) Brank, Broot = get_rank(Btoken) Prank, Proot = get_rank(Ptoken) graph = Graph() for token in doc: graph.add_edge(token, token.head, 1, 4) sent_root = [] for sent in doc.sents: sent_root.append(sent.root) for j in range(len(sent_root)-1): graph.add_edge(sent_root[j], sent_root[j+1],1, 4) try: _, Alen = dijsktra(graph, Atoken, Ptoken) except: Alen = 300 try: _, Blen = dijsktra(graph, Btoken, Ptoken) except: Blen = 300 sent_num = len(sent_root) for i in range(len(sent_root)): if Aroot == sent_root[i]: Atop = i if Broot == sent_root[i]: Btop = i if Proot == sent_root[i]: Ptop = i fi.append(Aoff/lth)#0 fi.append(Boff/lth)#1 fi.append(Poff/lth)#2 fi.append(1.0*Atop/sent_num)#3 fi.append(1.0*Btop/sent_num)#4 fi.append(1.0*Ptop/sent_num)#5 fi.append(Arank/10)#6 fi.append(Brank/10)#7 
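# Features appended so far: 0-2 are the character offsets of A, B and the pronoun
# normalised by text length; 3-5 are the indices of the sentences containing each
# mention, normalised by the sentence count; 6-8 are the dependency-tree depths
# returned by get_rank, scaled down by 10 (the pronoun's scaled rank comes next).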
fi.append(Prank/10)#8 #fi.append(Atoken.similarity(Ptoken))#9 #fi.append(Btoken.similarity(Ptoken))#10 #fi.append(Alen/300)#9 #fi.append(Blen/300)#10 #fi.append(child_count(Aroot))#11 #fi.append(child_count(Broot))#12 #fi.append(child_count(Proot))#13 features.append(fi) return np.vstack(features) def swap_raws(data, i, j): """"""Swap the ith and jth column of the data"""""" new_data = np.copy(data) temp = np.copy(new_data[:, i]) new_data[:,i] = new_data[:,j] new_data[:,j] = temp return new_data",No,5,53.0 "!pip install pytorch-pretrained-bert !pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.0.4.zip",No,3,22.0 "import os # This variable is used by helperbot to make the training deterministic os.environ[""SEED""] = ""420"" import logging from pathlib import Path import torch import torch.nn as nn import numpy as np import pandas as pd from torch.utils.data import Dataset, DataLoader from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.modeling import BertModel from helperbot import BaseBot, TriangularLR, WeightDecayOptimizerWrapper ",No,5,23.0 "BERT_MODEL = 'bert-large-uncased' CASED = False",No,5,77.0 "df_train = pd.read_csv(""gap-test.tsv"", delimiter=""\\t"") df_val = pd.read_csv(""gap-validation.tsv"", delimiter=""\\t"") df_test = pd.read_csv(""gap-development.tsv"", delimiter=""\\t"") sample_sub = pd.read_csv(""../input/sample_submission_stage_1.csv"") assert sample_sub.shape[0] == df_test.shape[0]'",No,5,45.0 "tokenizer = BertTokenizer.from_pretrained( BERT_MODEL, do_lower_case=CASED, never_split = (""[UNK]"", ""[SEP]"", ""[PAD]"", ""[CLS]"", ""[MASK]"", ""[A]"", ""[B]"", ""[P]"") ) # These tokens are not actually used, so we can assign arbitrary values. tokenizer.vocab[""[A]""] = -1 tokenizer.vocab[""[B]""] = -1 tokenizer.vocab[""[P]""] = -1",No,4,30.0 "sample = df.sample(random_state=100) display_entry(sample.iloc[0])",No,4,41.0 "import os # This variable is used by helperbot to make the training deterministic os.environ[""SEED""] = ""323"" import logging from pathlib import Path import torch import torch.nn as nn import numpy as np import pandas as pd from tqdm import tqdm_notebook from sklearn.metrics import log_loss from sklearn.model_selection import StratifiedKFold from torch.optim.lr_scheduler import CosineAnnealingLR from torch.utils.data import Dataset, DataLoader, TensorDataset from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.modeling import BertModel from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor from helperbot import ( TriangularLR, BaseBot, WeightDecayOptimizerWrapper )",No,5,23.0 "result = pd.DataFrame({'ID': df.index, 'A': 1, 'B': 1, 'NEITHER': 1}) result.to_csv('dummy_all_equal.csv', index=False)",No,4,25.0 "result['A'] = 1 result['B'] = 0 result['NEITHER'] = 0 result.to_csv('dummy_A.csv', index=False)",No,4,25.0 "result['A'] = 0 result['B'] = 1 result['NEITHER'] = 0 result.to_csv('dummy_B.csv', index=False)",No,5,25.0 "result['A'] = 0 result['B'] = 0 result['NEITHER'] = 1 result.to_csv('dummy_NEITHER.csv', index=False)",No,5,25.0 "# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html from sklearn.metrics import log_loss y_true = [""spam"", ""ham"", ""ham"", ""spam""] # The labels in y_pred are assumed to be ordered alphabetically, as done by preprocessing.LabelBinarizer # [""ham"", ""spam""] y_pred = [ [.1, .9], [.9, .1], [.8, .2], [.35, .65] ] log_loss(y_true, y_pred)",No,4,49.0 "# This Python 3 environment comes 
with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os import re print(os.listdir(""../input"")) import spacy import networkx as nx import zipfile sample_submission = pd.read_csv(""../input/gendered-pronoun-resolution/sample_submission_stage_1.csv"") final_test = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_2.tsv"", sep = ""\\t"") nlp = spacy.load('en_core_web_sm') dep = [""ACL"", ""ACOMP"", ""ADVCL"", ""ADVMOD"", ""AGENT"", ""AMOD"", ""APPOS"", ""ATTR"", ""AUX"", ""AUXPASS"", ""CASE"", ""CC"", ""CCOMP"", ""COMPOUND"", ""CONJ"", ""CSUBJ"", ""CSUBJPASS"", ""DATIVE"", ""DEP"", ""DET"", ""DOBJ"" , ""EXPL"", ""INTJ"", ""MARK"", ""META"", ""NEG"", ""NOUNMOD"", ""NPMOD"", ""NSUBJ"", ""NSUBJPASS"", ""NUMMOD"" , ""OPRD"", ""PARATAXIS"", ""PCOMP"", ""POBJ"", ""POSS"", ""PRECONJ"", ""PREDET"", ""PREP"", ""PRT"", ""PUNCT"", ""QUANTMOD"", ""RELCL"", ""ROOT"", ""XCOMP"", ""COMPLM"",""INFMOD"",""PARTMOD"",""HMOD"",""HYPH"",""IOBJ"",""NUM"", ""NUMBER"",""NMOD"",""NN"",""NPADVMOD"",""POSSESSIVE"",""RCMOD"",""SUBTOK""] # Any results you write to the current directory are saved as output.",No,4,22.0 final_test.shape,No,5,58.0 import tensorflow as tf,No,5,22.0 "train_data = pd.read_csv(""gap-development.tsv"", sep = ""\\t"") validation_data = pd.read_csv(""gap-validation.tsv"", sep = ""\\t"") test_data = pd.read_csv(""gap-test.tsv"", sep = ""\\t"")'",No,5,45.0 " merge_data = pd.concat([train_data,validation_data]).reset_index(drop = True) merge_data = pd.concat([merge_data,train_data]).reset_index(drop = True) count = 0",No,5,11.0 " def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def shortest_dependency_path(doc, e1=None, e2=None): edges = [] for token in doc: for child in token.children: edges.append(('{0}'.format(token), '{0}'.format(child))) graph = nx.Graph(edges) try: shortest_path = nx.shortest_path(graph, source=e1, target=e2) except Exception as e: shortest_path = [e1, e2] print(e) print(doc, e1, e2) return shortest_path def dependency_vector(doc, pronoun, word): vector = [0] * 59 # for token in doc: # if token.text == pronoun: # pi = token.i # elif token.text == word: # wi = token.i # if pi>wi: # for token in doc[wi:pi+1]: # index = dep.index(token.dep_.upper()) # vector[index] = 1 # else: # for token in doc[pi:wi+1]: # index = dep.index(token.dep_.upper()) # vector[index] = 1 # return vector x = shortest_dependency_path(doc, pronoun, word) for token in doc: if token.text in x: val = (x.index(str(token)) + 1) / len(x) try: index = dep.index(token.dep_.upper()) vector[index] = val except: pass return vector def get_features(df): df['A-offset2'] = df['A-offset'] + df['A'].map(len) df['B-offset2'] = df['B-offset'] + df['B'].map(len) df[""Text""] = df.apply(lambda row: name_replace(row[""Text""], row[""A""], ""Noun_1""), axis = 1) df[""Text""] = df.apply(lambda row: name_replace(row[""Text""], row[""B""], ""Noun_2""), axis = 1) new_df = pd.DataFrame([]) new_df[""Pronoun-offset""] = df[""Pronoun-offset""] new_df['A-offset'] = df[""A-offset""] new_df[""B-offset""] = df[""B-offset""] 
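# Besides the raw offsets, keep the end offsets of A and B (offset plus surface-form
# length) and the absolute character distance from the pronoun to each candidate,
# then append the per-row dependency-path vectors computed below.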
new_df['A-offset2'] = df['A-offset2'] new_df['B-offset2'] = df['B-offset2'] new_df['A_dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() new_df['B_dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() df[""Text""] = df.Text.apply(lambda row: "" and "".join(row.split("". ""))) vectors_A = df.apply(lambda row: dependency_vector(nlp(row[""Text""]), row[""Pronoun""],""Noun_1"") + dependency_vector(nlp(row[""Text""]), row[""Pronoun""],""Noun_2""), axis = 1) print(count) new_df_2 = pd.DataFrame(vectors_A.tolist()) new_df = pd.concat([new_df, new_df_2], axis = 1) return new_df '",Yes,2,53.0 "feature = get_features(merge_data) ",No,5,8.0 feature,No,5,53.0 " Y = merge_data[[""A-coref"", ""B-coref""]] Y.columns = [""A"",""B""] Y[""A""] = Y[""A""].astype(int) Y[""B""] = Y[""B""].astype(int) Y[""NEITHER""] = 1- (Y[""A""] + Y[""B""])",No,4,16.0 "from sklearn import * from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier import xgboost as xgb from xgboost import XGBClassifier from sklearn.preprocessing import StandardScaler x1, x2, y1, y2 = model_selection.train_test_split(feature.fillna(-1), Y, test_size=0.2, random_state=1) x1.head() x2.head() y2",No,4,22.0 "scaler = StandardScaler() x1 = scaler.fit_transform(x1) x2 = scaler.transform(x2) model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth = 7, n_estimators=1000, random_state=33)) # model = multiclass.OneVsRestClassifier(ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100, random_state=33)) # param_dist = {'objective': 'binary:logistic', 'max_depth': 1, 'n_estimators':1000, 'num_round':1000, 'eval_metric': 'logloss'} # model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**param_dist)) model.fit(x1, y1) print('log_loss', metrics.log_loss(y2, model.predict_proba(x2)))",No,3,18.0 "final_test = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_2.tsv"", sep = ""\\t"") feature = get_features(final_test) print(feature) '",No,3,45.0 " feature = feature.fillna(-1) # feature = scaler.transform(feature) print(feature) ",No,4,17.0 " Y = pd.DataFrame(model.predict_proba(feature).tolist(), columns=[""A"",""B"", ""NEITHER""]) r = final_test[[""ID""]] submission = pd.concat([r,Y], axis = 1)",No,3,12.0 "print(submission) submission.to_csv('submission.csv', index=False)",No,4,25.0 !ls ../input,No,5,88.0 "import time import os import random import numpy as np import pandas as pd import torch from torch.optim import Adam from torch.utils.data import Dataset from torch.nn import Module, Linear, Dropout import torch.nn.functional as F from pytorch_pretrained_bert.modeling import BertModel, BertLayer from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam from pytorch_pretrained_bert.optimization import warmup_linear from torch.utils.data import DataLoader from torch.utils.data import RandomSampler from sklearn.metrics import log_loss import matplotlib.pyplot as plt",No,5,22.0 "# Model bert_model = ""bert-large-cased"" n_bertlayers = 22 dropout = 0.1 # Preprocessing do_lower_case = False # Training train_batch_size = 4 gradient_accumulation_steps = 5 lr = 1e-5 num_train_epochs = 2 warmup_proportion = 0.1 optim = ""bertadam"" weight_decay = False # Others n_models = 10 eval_batch_size = 32 device = torch.device(""cuda"") data_dir = """"",No,4,59.0 "mf = '../input/fork-of-fork-of-densenet201/model.hdf5' import os os.system('ls '+mf)",No,5,88.0 "plt.figure(figsize=(16,8)) for i,country in enumerate(list_countries): 
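    # Daily new fatalities per country: difference the cumulative series (restricted to
    # days with a nonzero count), keep only positive increments and plot one curve per country.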
Fatal_diff=Fatal_pivot[(Fatal_pivot[country]>0)][country].diff().fillna(0) Fatal_diff=Fatal_diff[Fatal_diff>0] Fatal_diff.plot(color=colors[i],label=country,lw=5) plt.title('Number of daily new Fatalities',fontsize=15) plt.legend(title='country') plt.tight_layout()",No,5,33.0 "# all prediction is correct y_pred = [ [0., 1.], [1., 0.], [1., 0.], [0., 1.] ] log_loss(y_true, y_pred)",No,5,84.0 "# all prediction is wrong y_pred = [ [1., 0.], [0., 1.], [0., 1.], [1., 0.] ] log_loss(y_true, y_pred)",No,5,49.0 "samples = df.sample(n=10, random_state=100) for _, s in samples.iterrows(): display_entry(s)",No,4,41.0 "import spacy nlp = spacy.load('en_core_web_lg') displacy.render(nlp(samples.iloc[-2]['Text']), style='dep', jupyter=True, options={'distance': 150})",No,4,22.0 "# following: https://www.kaggle.com/keyit92/end2end-coref-resolution-by-attention-rnn/data train_df = read_df(""../input/googlegapcoreference/gap-test.tsv"") test_df = read_df(""../input/googlegapcoreference/gap-development.tsv"") dev_df = read_df(""../input/googlegapcoreference/gap-validation.tsv"")",No,5,45.0 "print(f""Train: {train_df.shape}\ Test: {test_df.shape}\ Development: {dev_df.shape}"")'",No,5,58.0 "sample = train_df.sample(random_state=555) display_entry(sample.iloc[0])",No,5,41.0 "# just testing if there is any entry with more than one answer train_df[train_df[['A-coref', 'B-coref']].sum(axis=1) > 1]",No,4,41.0 "# adding a column with the answer def get_answer(row): if row['A-coref']: return 'A' if row['B-coref']: return 'B' return 'NEITHER' train_df['answer'] = train_df.apply(get_answer, axis=1)",No,5,8.0 train_df['Text-length'].hist(),No,5,33.0 "train_df.groupby(pd.qcut(train_df['Text-length'], q=[0, .25, .5, .75, 1.]))['answer'].value_counts().unstack()",No,5,60.0 "train_df_A = train_df[train_df['answer'] == 'A'] train_df_B = train_df[train_df['answer'] == 'B'] train_df_NEITHER = train_df[train_df['answer'] == 'NEITHER'] X_A_A = train_df_A.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length']] X_A_A['y'] = 1 X_A_A['referred-expression'] = 'A' X_A_B = train_df_A.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_A_B['y'] = 0 X_A_B['referred-expression'] = 'A' X_B_B = train_df_B.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_B_B['y'] = 1 X_B_B['referred-expression'] = 'B' X_B_A = train_df_B.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_B_A['y'] = 0 X_B_A['referred-expression'] = 'B' X_NEITHER_A = train_df_NEITHER.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_NEITHER_A['y'] = 0 X_NEITHER_A['referred-expression'] = 'A' X_NEITHER_B = train_df_NEITHER.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_NEITHER_B['y'] = 0 X_NEITHER_B['referred-expression'] = 'B' X_df = pd.concat((X_A_A, X_A_B, X_B_A, X_B_B, X_NEITHER_A, X_NEITHER_B))",Yes,3,61.0 X_df.shape,No,5,58.0 
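# Added illustration, not part of the original notebook: a quick sanity check on the
# pairwise frame built above. Every GAP row contributes two candidate/pronoun pairs and
# at most one of them is positive (none for NEITHER rows), so the positive rate of y
# should sit somewhat below one half.
print(X_df['y'].value_counts(normalize=True))
print(X_df.groupby('referred-expression')['y'].mean())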
X_df.sample(random_state=1),No,5,41.0 "from textacy import similarity def add_features(df, re_col, inplace=False): if inplace: df_ = df else: df_ = df.copy() df_['URL_last_part'] = df_['URL'].str.rsplit('/', n=1, expand=True)[1].apply(preprocess_so) df_['URL_distance_jaro_winkler'] = df_.apply(lambda row: similarity.jaro_winkler(row['URL_last_part'], row[re_col]), axis=1) df_['URL_distance_levenshtein'] = df_.apply(lambda row: similarity.levenshtein(row['URL_last_part'], row[re_col]), axis=1) df_['URL_distance_token_sort_ratio'] = df_.apply(lambda row: similarity.token_sort_ratio(row['URL_last_part'], row[re_col]), axis=1) return df_ add_features(X_df, 'RE', inplace=True) X_df.sample(5, random_state=800)[['URL_last_part', 'URL']]",No,4,8.0 "X_df.hist(column='URL_distance_jaro_winkler', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "X_df.hist(column='URL_distance_levenshtein', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "X_df.hist(column='URL_distance_token_sort_ratio', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split X = X_df[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length', 'referred-expression']] y = X_df['y'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23) X_train_features = X_train.drop(columns='referred-expression') X_train_referred_expression = X_train['referred-expression'] X_test_features = X_test.drop(columns='referred-expression') X_test_referred_expression = X_test['referred-expression'] lr = LinearRegression(normalize=True).fit(X_train_features, y_train) y_pred_ = lr.predict(X_test_features)",Yes,4,21.0 "import numpy as np def transform_to_submit(y_pred_, referred_expression): y_pred_comp = 1 - y_pred_ all_zero = np.zeros_like(y_pred_).reshape((-1, 1)) y_pred = np.hstack(( np.where(referred_expression == 'A', y_pred_, y_pred_comp).reshape((-1, 1)), np.where(referred_expression == 'B', y_pred_comp, y_pred_).reshape((-1, 1)), all_zero )) return y_pred y_true = np.hstack(( np.where(((X_test_referred_expression == 'A') & (y_test)), 1, 0).reshape((-1, 1)), np.where(((X_test_referred_expression == 'B') & (y_test)), 1, 0).reshape((-1, 1)), np.zeros_like(y_test).reshape((-1, 1)) )) # TODO: refact # one of the ideas is to run the model over all the referred expressions and then calculate the final answer #y_pred_A = lr.predict(df_A).reshape(-1, 1) #y_pred_B = lr.predict(df_B).reshape(-1, 1) #all_zero = np.zeros_like(y_pred_A) #X_test_A = add_features(X_test, 'A', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] #X_test_B = add_features(X_test, 'B', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] #y_pred = np.hstack((y_pred_A, # y_pred_B, # all_zero # )) #y_true[(np.abs(y_true[:, 0] - y_true[:, 1]) < 0.1) & (y_true[:, 0] < .3), 2] = .5 log_loss(y_true, transform_to_submit(y_pred_, X_test_referred_expression))",No,4,55.0 y_true,No,5,41.0 "X_features = X.drop(columns='referred-expression') X_referred_expression = X['referred-expression'] lr.fit(X_features, y)",Yes,3,7.0 "df_A = add_features(df, 'A', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] df_B = add_features(df, 'B', inplace=False)[['URL_distance_token_sort_ratio', 
'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] y_pred_A = lr.predict(df_A).reshape(-1, 1) y_pred_B = lr.predict(df_B).reshape(-1, 1) all_zero = np.zeros_like(y_pred_A) y_pred = np.hstack((y_pred_A, y_pred_B, all_zero )) result = pd.DataFrame(y_pred, index=df.index, columns=['A', 'B', 'NEITHER']) result.loc[((result['A'] - result['B']).abs() < 0.1) & (result['A'] < .3), 'NEITHER'] = .3 result.to_csv('lr_over_URL_similarity.csv')",Yes,3,27.0 "import numpy as np import pandas as pd import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "os.environ[""SEED""] = ""420"" import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertConfig import re from tqdm import tqdm",No,5,23.0 "df_train = pd.read_csv(""/kaggle/input/gapvalidation/gap-test.tsv"", delimiter=""\\t"") df_val = pd.read_csv(""/kaggle/input/gapvalidation/gap-validation.tsv"", delimiter=""\\t"") df_test = pd.read_csv(""/kaggle/input/gapvalidation/gap-development.tsv"", delimiter=""\\t"") test_2 = pd.read_csv(""/kaggle/input/gendered-pronoun-resolution/test_stage_2.tsv"", delimiter=""\\t"") PRETRAINED_MODEL_NAME = 'bert-large-uncased' bert_path = ""../input/bert-base-uncased/"" tokenizer = BertTokenizer.from_pretrained(bert_path) pad_len = 300'",No,4,45.0 "tokenizer.add_tokens(['[A]', '[B]', '[P]']) def insert_tag(row): to_be_inserted = sorted([ (row[""A-offset""], "" [A] ""), (row[""B-offset""], "" [B] ""), (row[""Pronoun-offset""], "" [P] "") ], key=lambda x: x[0], reverse=True) text = row[""Text""] for offset, tag in to_be_inserted: text = text[:offset] + tag + text[offset:] return text def tokenize(text, tokenizer): entries = {} final_tokens = [] for token in tokenizer.tokenize(text): if token in (""[A]"", ""[B]"", ""[P]""): entries[token] = len(final_tokens) continue final_tokens.append(token) return final_tokens, (entries[""[A]""], entries[""[B]""], entries[""[P]""]) def target(row): if int(row['A-coref']) == 1: return 0 elif int(row['B-coref']) == 1: return 1 else: return 2 """""" The lower part was taken from [PyTorch] BERT + EndpointSpanExtractor + KFold """""" def children(m): return m if isinstance(m, (list, tuple)) else list(m.children()) def set_trainable_attr(m, b): m.trainable = b for p in m.parameters(): p.requires_grad = b def apply_leaf(m, f): c = children(m) if isinstance(m, nn.Module): f(m) if len(c) > 0: for l in c: apply_leaf(l, f) def set_trainable(l, b): apply_leaf(l, lambda m: set_trainable_attr(m, b))'",Yes,3,8.0 "class modified_dataset(torch.utils.data.Dataset): def __init__(self, df, tokenizer): p_text = [] offsets = [] at_mask = [] self.y_lst = df[['A-coref', 'B-coref']].apply(lambda row: target(row), axis = 1) for row in tqdm(range(len(df))): tokens, offset = tokenize(insert_tag(df.iloc[row]), tokenizer) bla = tokenizer.encode_plus(tokens, max_length = pad_len, pad_to_max_length = True, return_token_type_ids = False) p_text.append(bla['input_ids']) at_mask.append(bla['attention_mask']) offsets.append(offset) self.p_text = torch.tensor(p_text) self.offsets = torch.tensor(offsets) self.at_mask = torch.tensor(at_mask) return def __len__(self): return len(self.p_text) def __getitem__(self,item): return self.p_text[item], self.y_lst[item], self.offsets[item], self.at_mask[item] class modified_dataset_test(torch.utils.data.Dataset): def __init__(self, df, tokenizer): p_text = [] 
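# Same preprocessing as modified_dataset above, but for the unlabeled stage-2 test set:
# only the encoded token ids, the [A]/[B]/[P] tag offsets and the attention masks are
# stored, since no coreference labels are available.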
offsets = [] at_mask = [] for row in tqdm(range(len(df))): tokens, offset = tokenize(insert_tag(df.iloc[row]), tokenizer) bla = tokenizer.encode_plus(tokens, max_length = pad_len, pad_to_max_length = True, return_token_type_ids = False) p_text.append(bla['input_ids']) at_mask.append(bla['attention_mask']) offsets.append(offset) self.p_text = torch.tensor(p_text) self.offsets = torch.tensor(offsets) self.at_mask = torch.tensor(at_mask) return def __len__(self): return len(self.p_text) def __getitem__(self,item): return self.p_text[item], self.offsets[item], self.at_mask[item] def collate_fun(batch): tmp_lst = list(zip(*batch)) return torch.stack(tmp_lst[0], axis = 0), torch.tensor(tmp_lst[1]), torch.stack(tmp_lst[2], axis = 0), torch.stack(tmp_lst[3], axis = 0) def collate_fun2(batch): tmp_lst = list(zip(*batch)) return torch.stack(tmp_lst[0], axis = 0), torch.stack(tmp_lst[1], axis = 0), torch.stack(tmp_lst[2], axis = 0) train_loader = DataLoader( modified_dataset(df_train, tokenizer), batch_size=18, collate_fn=collate_fun, shuffle=True, drop_last=True, num_workers=2) val_loader = DataLoader( modified_dataset(df_val, tokenizer), batch_size=30, collate_fn=collate_fun, shuffle=False, num_workers=2) test_loader = DataLoader( modified_dataset(df_test, tokenizer), batch_size=30, collate_fn=collate_fun, shuffle=False, num_workers=2) test_2_loader = DataLoader( modified_dataset_test(test_2, tokenizer), batch_size=30, collate_fn=collate_fun2, shuffle=False, num_workers=2)",Yes,3,44.0 "subs = df_weo['Subject Descriptor'].unique()[:-1] df_weo_agg = df_weo[['Country']][df_weo['Country'].duplicated()==False].reset_index(drop=True) for sub in subs[:]: df_tmp = df_weo[['Country', '2019']][df_weo['Subject Descriptor']==sub].reset_index(drop=True) df_tmp = df_tmp[df_tmp['Country'].duplicated()==False].reset_index(drop=True) df_tmp.columns = ['Country', sub] df_weo_agg = df_weo_agg.merge(df_tmp, on='Country', how='left') df_weo_agg.columns = ["""".join (c if c.isalnum() else ""_"" for c in str(x)) for x in df_weo_agg.columns] df_weo_agg.columns df_weo_agg['Country_Region'] = df_weo_agg['Country'] df_weo_agg.head()'",No,3,14.0 "# merge df_traintest5 = pd.merge(df_traintest4, df_weo_agg, on='Country_Region', how='left') print(df_traintest5.shape) df_traintest5.head()",No,3,32.0 "# add Life expectancy # Life expectancy at birth obtained from http://hdr.undp.org/en/data df_life = pd.read_csv(""../input/smokingstats/Life expectancy at birth.csv"") tmp = df_life.iloc[:,1].values.tolist() df_life = df_life[['Country', '2018']] def func(x): x_new = 0 try: x_new = float(x.replace("","", """")) except: # print(x) x_new = np.nan return x_new df_life['2018'] = df_life['2018'].apply(lambda x: func(x)) df_life.head()'",Yes,2,45.0 "df_life = df_life[['Country', '2018']] df_life.columns = ['Country_Region', 'LifeExpectancy']",No,3,14.0 "# merge df_traintest6 = pd.merge(df_traintest5, df_life, on='Country_Region', how='left') print(len(df_traintest6)) df_traintest6.head()",No,4,32.0 "# add additional info from countryinfo dataset df_country = pd.read_csv(""../input/countryinfo/covid19countryinfo.csv"") df_country.head()",No,4,45.0 "df_country['Country_Region'] = df_country['country'] df_country = df_country[df_country['country'].duplicated()==False]",No,3,19.0 print(df_country[df_country['country'].duplicated()].shape),No,5,38.0 df_country[df_country['country'].duplicated()],No,2,38.0 "df_traintest7 = pd.merge(df_traintest6, df_country.drop(['tests', 'testpop', 'country'], axis=1), on=['Country_Region',], how='left') 
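# Left-join the country-level metadata (demographics, health statistics, reported case
# counts) onto the train/test frame; places whose country is absent from
# covid19countryinfo.csv simply receive NaN in these columns.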
print(df_traintest7.shape) df_traintest7.head()",No,3,32.0 "def encode_label(df, col, freq_limit=0): df[col][pd.isna(df[col])] = 'nan' tmp = df[col].value_counts() cols = tmp.index.values freq = tmp.values num_cols = (freq>=freq_limit).sum() print(""col: {}, num_cat: {}, num_reduced: {}"".format(col, len(cols), num_cols)) col_new = '{}_le'.format(col) df_new = pd.DataFrame(np.ones(len(df), np.int16)*(num_cols-1), columns=[col_new]) for i, item in enumerate(cols[:num_cols]): df_new[col_new][df[col]==item] = i return df_new def get_df_le(df, col_index, col_cat): df_new = df[[col_index]] for col in col_cat: df_tmp = encode_label(df, col) df_new = pd.concat([df_new, df_tmp], axis=1) return df_new df_traintest7['id'] = np.arange(len(df_traintest7)) df_le = get_df_le(df_traintest7, 'id', ['Country_Region', 'Province_State']) df_traintest8 = pd.merge(df_traintest7, df_le, on='id', how='left')'",Yes,3,8.0 "df_traintest8['cases/day'] = df_traintest8['cases/day'].astype(np.float) df_traintest8['fatal/day'] = df_traintest8['fatal/day'].astype(np.float)",No,5,16.0 "# covert object type to float def func(x): x_new = 0 try: x_new = float(x.replace("","", """")) except: # print(x) x_new = np.nan return x_new cols = [ 'Gross_domestic_product__constant_prices', 'Gross_domestic_product__current_prices', 'Gross_domestic_product__deflator', 'Gross_domestic_product_per_capita__constant_prices', 'Gross_domestic_product_per_capita__current_prices', 'Output_gap_in_percent_of_potential_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__valuation_of_country_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__per_capita_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__share_of_world_total', 'Implied_PPP_conversion_rate', 'Total_investment', 'Gross_national_savings', 'Inflation__average_consumer_prices', 'Inflation__end_of_period_consumer_prices', 'Six_month_London_interbank_offered_rate__LIBOR_', 'Volume_of_imports_of_goods_and_services', 'Volume_of_Imports_of_goods', 'Volume_of_exports_of_goods_and_services', 'Volume_of_exports_of_goods', 'Unemployment_rate', 'Employment', 'Population', 'General_government_revenue', 'General_government_total_expenditure', 'General_government_net_lending_borrowing', 'General_government_structural_balance', 'General_government_primary_net_lending_borrowing', 'General_government_net_debt', 'General_government_gross_debt', 'Gross_domestic_product_corresponding_to_fiscal_year__current_prices', 'Current_account_balance', 'pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung', 'gdp2019', 'healthexp', 'healthperpop', 'fertility', 'firstcase', 'totalcases', 'activecases', 'newcases', 'deaths', 'newdeaths', 'recovered', 'critical', 'casediv1m', 'deathdiv1m', ] for col in cols: df_traintest8[col] = df_traintest8[col].apply(lambda x: func(x)) print(df_traintest8['pop'].dtype)'",No,3,16.0 df_traintest8[df_traintest8['place_id']=='China/Hubei'].head(),No,5,41.0 "for col in df_traintest8.columns: print(""'{}',"".format(col))'",No,5,71.0 "day_before_valid = 71 # 3-11 day before of validation day_before_public = 78 # 3-18 last day of train day_before_launch = 85 # 4-1 last day before launch",No,2,23.0 "def calc_score(y_true, y_pred): y_true[y_true<0] = 0 score = metrics.mean_squared_error(np.log(y_true.clip(0, 1e10)+1), np.log(y_pred[:]+1))**0.5 return score",No,5,84.0 "# train model to predict fatalities/day # params SEED = 42 params = {'num_leaves': 8, 'min_data_in_leaf': 5, # 42, 
'objective': 'regression', 'max_depth': 8, 'learning_rate': 0.02, 'boosting': 'gbdt', 'bagging_freq': 5, # 5 'bagging_fraction': 0.8, # 0.5, 'feature_fraction': 0.8201, 'bagging_seed': SEED, 'reg_alpha': 1, # 1.728910519108444, 'reg_lambda': 4.9847051755586085, 'random_state': SEED, 'metric': 'mse', 'verbosity': 100, 'min_gain_to_split': 0.02, # 0.01077313523861969, 'min_child_weight': 5, # 19.428902804238373, 'num_threads': 6, } ",No,5,59.0 "# train model to predict fatalities/day # features are selected manually based on valid score col_target = 'fatal/day' col_var = [ 'Lat', 'Long', 'days_since_1cases', # 'days_since_10cases', # 'days_since_100cases', # 'days_since_1fatal', # 'days_since_10fatal', 'days_since_100fatal', # 'cases/day_(1-1)', # 'cases/day_(1-7)', # 'cases/day_(8-14)', # 'cases/day_(15-21)', # 'fatal/day_(1-1)', # 'fatal/day_(1-7)', # 'fatal/day_(8-14)', # 'fatal/day_(15-21)', 'SmokingRate', 'Gross_domestic_product__constant_prices', 'Gross_domestic_product__current_prices', 'Gross_domestic_product__deflator', 'Gross_domestic_product_per_capita__constant_prices', 'Gross_domestic_product_per_capita__current_prices', 'Output_gap_in_percent_of_potential_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__valuation_of_country_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__per_capita_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__share_of_world_total', 'Implied_PPP_conversion_rate', 'Total_investment', 'Gross_national_savings', 'Inflation__average_consumer_prices', 'Inflation__end_of_period_consumer_prices', 'Six_month_London_interbank_offered_rate__LIBOR_', 'Volume_of_imports_of_goods_and_services', 'Volume_of_Imports_of_goods', 'Volume_of_exports_of_goods_and_services', 'Volume_of_exports_of_goods', 'Unemployment_rate', 'Employment', 'Population', 'General_government_revenue', 'General_government_total_expenditure', 'General_government_net_lending_borrowing', 'General_government_structural_balance', 'General_government_primary_net_lending_borrowing', 'General_government_net_debt', 'General_government_gross_debt', 'Gross_domestic_product_corresponding_to_fiscal_year__current_prices', 'Current_account_balance', 'LifeExpectancy', # 'pop', 'density', 'medianage', 'urbanpop', 'hospibed', # 'smokers', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung', 'gdp2019', 'healthexp', 'healthperpop', ] col_cat = [] df_train = df_traintest8[(pd.isna(df_traintest8['ForecastId'])) & (df_traintest8['day']<=day_before_valid)] df_valid = df_traintest8[(pd.isna(df_traintest8['ForecastId'])) & (day_before_validday_before_public-2].head()",No,5,14.0 "# remove overlap for private LB prediction df_tmp = df_traintest8[ ((df_traintest8['day']<=day_before_private) & (pd.isna(df_traintest8['ForecastId']))) | ((day_before_privateday_before_private-2].head()",No,4,14.0 "# predict test data in public # predict the cases and fatatilites one day at a time and use the predicts as next day's feature recursively. 
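# The two target columns are first masked (set to -1) on every test-period row; the loop
# below then fills them in one day at a time, so each new day's lag features are rebuilt
# from the predictions already made rather than from ground truth.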
df_preds = [] for i, place in enumerate(places[:]): df_interest = copy.deepcopy(df_traintest9[df_traintest9['place_id']==place].reset_index(drop=True)) df_interest['cases/day'][(pd.isna(df_interest['ForecastId']))==False] = -1 df_interest['fatal/day'][(pd.isna(df_interest['ForecastId']))==False] = -1 len_known = (df_interest['day']<=day_before_public).sum() len_unknown = (day_before_publicday_before_private] = df_preds_pri[df_preds['day']>day_before_private]",No,5,14.0 "df_preds.to_csv(""df_preds.csv"", index=None)",No,5,25.0 "# load sample submission df_sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") print(len(df_sub)) df_sub.head()",No,3,45.0 "# merge prediction with sub df_sub = pd.merge(df_sub, df_traintest3[['ForecastId', 'place_id', 'day']]) df_sub = pd.merge(df_sub, df_preds[['place_id', 'day', 'cases_pred', 'fatal_pred']], on=['place_id', 'day',], how='left') df_sub.head(10)",No,4,32.0 "# save df_sub['ConfirmedCases'] = df_sub['cases_pred'] df_sub['Fatalities'] = df_sub['fatal_pred'] df_sub = df_sub[['ForecastId', 'ConfirmedCases', 'Fatalities']] df_sub.to_csv(""submission.csv"", index=None) df_sub.head(10)'",No,3,25.0 %matplotlib inline,Yes,3,22.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB import xgboost as xgb",No,5,23.0 "# get rossmann, store, & test csv files as a DataFrame rossmann_df = pd.read_csv(""../input/train.csv"") store_df = pd.read_csv(""../input/store.csv"") test_df = pd.read_csv(""../input/test.csv"") # preview the data rossmann_df.head()",No,4,45.0 "rossmann_df.info() print(""----------------------------"") store_df.info() print(""----------------------------"") test_df.info()",No,5,40.0 "# Open fig, (axis1) = plt.subplots(1,1,figsize=(15,4)) sns.countplot(x='Open',hue='DayOfWeek', data=rossmann_df,palette=""husl"", ax=axis1) # fill NaN values in test_df with Open=1 if DayOfWeek != 7 test_df[""Open""][test_df[""Open""] != test_df[""Open""]] = (test_df[""DayOfWeek""] != 7).astype(int) # Drop Open column # rossmann_df.drop(""Open"", axis=1, inplace=True) # test_df.drop(""Open"", axis=1, inplace=True)'",Yes,2,17.0 "# Date # Create Year and Month columns rossmann_df['Year'] = rossmann_df['Date'].apply(lambda x: int(str(x)[:4])) rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x: int(str(x)[5:7])) test_df['Year'] = test_df['Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Date'].apply(lambda x: int(str(x)[5:7])) # Assign Date column to Date(Year-Month) instead of (Year-Month-Day) # this column will be useful in analysis and visualization rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x: (str(x)[:7])) test_df['Date'] = test_df['Date'].apply(lambda x: (str(x)[:7])) # group by date and get average sales, and precent change average_sales = rossmann_df.groupby('Date')[""Sales""].mean() pct_change_sales = rossmann_df.groupby('Date')[""Sales""].sum().pct_change() fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8)) # plot average sales over time(year-month) ax1 = 
average_sales.plot(legend=True,ax=axis1,marker='o',title=""Average Sales"") ax1.set_xticks(range(len(average_sales))) ax1.set_xticklabels(average_sales.index.tolist(), rotation=90) # plot precent change for sales over time(year-month) ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap=""summer"",title=""Sales Percent Change"") # ax2.set_xticks(range(len(pct_change_sales))) # ax2.set_xticklabels(pct_change_sales.index.tolist(), rotation=90)'",Yes,2,8.0 "# .... contiune with Date # Plot average sales & customers for every year fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Year', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='Year', y='Customers', data=rossmann_df, ax=axis2) # Drop Date column # rossmann_df.drop(['Date'], axis=1,inplace=True) # test_df.drop(['Date'], axis=1,inplace=True)",No,5,75.0 "# Customers fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8)) # Plot max, min values, & 2nd, 3rd quartile sns.boxplot([rossmann_df[""Customers""]], whis=np.inf, ax=axis1) # group by date and get average customers, and precent change average_customers = rossmann_df.groupby('Date')[""Customers""].mean() # pct_change_customers = rossmann_df.groupby('Date')[""Customers""].sum().pct_change() # Plot average customers over the time # it should be correlated with the average sales over time ax = average_customers.plot(legend=True,marker='o', ax=axis2) ax.set_xticks(range(len(average_customers))) xlabels = ax.set_xticklabels(average_customers.index.tolist(), rotation=90)'",No,3,33.0 "# DayOfWeek # In both cases where the store is closed and opened fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='DayOfWeek', y='Sales', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis2)",No,5,75.0 "# Promo # Plot average sales & customers with/without promo fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='Promo', y='Customers', data=rossmann_df, ax=axis2)",No,5,33.0 "# StateHoliday # StateHoliday column has values 0 & ""0"", So, we need to merge values with 0 to ""0"" rossmann_df[""StateHoliday""].loc[rossmann_df[""StateHoliday""] == 0] = ""0"" # test_df[""StateHoliday""].loc[test_df[""StateHoliday""] == 0] = ""0"" # Plot sns.countplot(x='StateHoliday', data=rossmann_df) # Before fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1) mask = (rossmann_df[""StateHoliday""] != ""0"") & (rossmann_df[""Sales""] > 0) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)'",No,5,33.0 "# .... 
continue with StateHoliday # After rossmann_df[""StateHoliday""] = rossmann_df[""StateHoliday""].map({0: 0, ""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) test_df[""StateHoliday""] = test_df[""StateHoliday""].map({0: 0, ""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)'",No,5,33.0 "# SchoolHoliday # Plot sns.countplot(x='SchoolHoliday', data=rossmann_df) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='SchoolHoliday', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='SchoolHoliday', y='Customers', data=rossmann_df, ax=axis2)",No,5,33.0 "# Sales fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8)) # Plot max, min values, & 2nd, 3rd quartile sns.boxplot([rossmann_df[""Customers""]], whis=np.inf, ax=axis1) # Plot sales values # Notice that values with 0 is mostly because the store was closed rossmann_df[""Sales""].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)'",No,5,33.0 "# Using store_df # Merge store_df with average store sales & customers average_sales_customers = rossmann_df.groupby('Store')[[""Sales"", ""Customers""]].mean() sales_customers_df = DataFrame({'Store':average_sales_customers.index, 'Sales':average_sales_customers[""Sales""], 'Customers': average_sales_customers[""Customers""]}, columns=['Store', 'Sales', 'Customers']) store_df = pd.merge(sales_customers_df, store_df, on='Store') store_df.head()'",No,3,8.0 "# StoreType # Plot StoreType, & StoreType Vs average sales and customers sns.countplot(x='StoreType', data=store_df, order=['a','b','c', 'd']) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StoreType', y='Sales', data=store_df, order=['a','b','c', 'd'],ax=axis1) sns.barplot(x='StoreType', y='Customers', data=store_df, order=['a','b','c', 'd'], ax=axis2)",No,5,33.0 "# Assortment # Plot Assortment, & Assortment Vs average sales and customers sns.countplot(x='Assortment', data=store_df, order=['a','b','c']) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Assortment', y='Sales', data=store_df, order=['a','b','c'], ax=axis1) sns.barplot(x='Assortment', y='Customers', data=store_df, order=['a','b','c'], ax=axis2)",No,5,33.0 "# Promo2 # Plot Promo2, & Promo2 Vs average sales and customers sns.countplot(x='Promo2', data=store_df) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo2', y='Sales', data=store_df, ax=axis1) sns.barplot(x='Promo2', y='Customers', data=store_df, ax=axis2)",No,5,33.0 "# CompetitionDistance # fill NaN values store_df[""CompetitionDistance""].fillna(store_df[""CompetitionDistance""].median()) # Plot CompetitionDistance Vs Sales store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4)) store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))'",Yes,3,33.0 "# What happened to the average sales of a store over time when competition started? 
# Example: the average sales for store_id = 6 has dramatically decreased since the competition started store_id = 6 store_data = rossmann_df[rossmann_df[""Store""] == store_id] average_store_sales = store_data.groupby('Date')[""Sales""].mean() # Get year, and month when Competition started y = store_df[""CompetitionOpenSinceYear""].loc[store_df[""Store""] == store_id].values[0] m = store_df[""CompetitionOpenSinceMonth""].loc[store_df[""Store""] == store_id].values[0] # Plot ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o') ax.set_xticks(range(len(average_store_sales))) ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90) # Since all data of store sales given in rossmann_df starts with year=2013 till 2015, # So, we need to check if year>=2013 and y & m aren't NaN values. if y >= 2013 and y == y and m == m: plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')'",No,4,33.0 "# Risk Analysis # Analyze the risk of a store; Risk(std) Vs Expected(mean) # .... countiue using store_data store_average = store_data[""Sales""].mean() store_std = store_data[""Sales""].std() # Plot plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20) # Get min & max mean and std of store sales # Remember that store_df[""Sales""] has the average sales for a store std_sales = rossmann_df.groupby('Store')[""Sales""].std() min_average = store_df[""Sales""].min() max_average = store_df[""Sales""].max() min_std = std_sales.min() max_std = std_sales.max() # Set the x and y limits of the plot plt.ylim([min_std, max_std]) plt.xlim([min_average, max_average]) # Set the plot axis titles plt.xlabel('Expected Sales') plt.ylabel('Risk') # Set label label, x, y = ""Store {}"".format(store_id), store_average, store_std plt.annotate( label, xy = (x, y), xytext = (50, 50), textcoords = 'offset points', ha = 'right', va = 'bottom', arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))'",No,4,33.0 "# .... 
continue Correlation # Plot correlation between range of stores start_store = 1 end_store = 5 fig, (axis1) = plt.subplots(1,1,figsize=(15,5)) # using summation of sales values for each store sns.heatmap(store_piv[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2) # using percent change for each store # sns.heatmap(store_pct_chage[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)",No,5,80.0 "# Notice that test_df has only year=2015, and months 8 & 9 # drop Year and Month rossmann_df.drop([""Year"", ""Month""], axis=1, inplace=True) test_df.drop([""Year"", ""Month""], axis=1, inplace=True) # Create dummy varibales for DayOfWeek day_dummies_rossmann = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day') day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True) day_dummies_test = pd.get_dummies(test_df['DayOfWeek'],prefix='Day') day_dummies_test.drop(['Day_7'], axis=1, inplace=True) rossmann_df = rossmann_df.join(day_dummies_rossmann) test_df = test_df.join(day_dummies_test) rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True) test_df.drop(['DayOfWeek'], axis=1,inplace=True)'",No,3,8.0 "# remove all rows(store,date) that were closed rossmann_df = rossmann_df[rossmann_df[""Open""] != 0] # drop unnecessary columns, these columns won't be useful in prediction rossmann_df.drop([""Open"",""Customers"", ""Date""], axis=1, inplace=True)'",No,3,10.0 "# save ids of closed stores, because we will assign their sales value to 0 later(see below) closed_store_ids = test_df[""Id""][test_df[""Open""] == 0].values # remove all rows(store,date) that were closed test_df = test_df[test_df[""Open""] != 0] # drop unnecessary columns, these columns won't be useful in prediction test_df.drop(['Open', 'Date'], axis=1,inplace=True)'",Yes,2,14.0 "# Loop through each store, # train the model using the data of current store, and predict it's sales values. rossmann_dic = dict(list(rossmann_df.groupby('Store'))) test_dic = dict(list(test_df.groupby('Store'))) submission = Series() scores = [] for i in test_dic: # current store store = rossmann_dic[i] # define training and testing sets X_train = store.drop([""Sales"",""Store""],axis=1) Y_train = store[""Sales""] X_test = test_dic[i].copy() store_ids = X_test[""Id""] X_test.drop([""Id"",""Store""], axis=1,inplace=True) # Linear Regression lreg = LinearRegression() lreg.fit(X_train, Y_train) Y_pred = lreg.predict(X_test) scores.append(lreg.score(X_train, Y_train)) # Xgboost # params = {""objective"": ""reg:linear"", ""max_depth"": 10} # T_train_xgb = xgb.DMatrix(X_train, Y_train) # X_test_xgb = xgb.DMatrix(X_test) # gbm = xgb.train(params, T_train_xgb, 100) # Y_pred = gbm.predict(X_test_xgb) # append predicted values of current store to submission submission = submission.append(Series(Y_pred, index=store_ids)) # append rows(store,date) that were closed, and assign their sales value to 0 submission = submission.append(Series(0, index=closed_store_ids)) # save to csv file submission = pd.DataFrame({ ""Id"": submission.index, ""Sales"": submission.values}) submission.to_csv('rossmann.csv', index=False)'",Yes,2,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline plt.style.use('ggplot') plt.rcParams['figure.figsize'] = (15, 6)",No,5,23.0 "df = pd.read_csv(""../input/train.csv"")",No,5,45.0 df.head().T,No,5,41.0 df.shape,No,5,58.0 "print (df.Store.unique()) print (df.Store.nunique())",No,3,54.0 df['Year'] = df.Date.map(lambda x: int(x[:4])),No,5,16.0 df['Month'] =df.Date.map(lambda x: int(x[5:7])),No,5,16.0 df.Promo.unique(),No,5,57.0 df.StateHoliday.unique(),No,5,57.0 df.groupby(['StateHoliday'])['Open'].agg('mean'),No,5,60.0 "df.StateHoliday = df['StateHoliday'].map(lambda x: (int)(x in ['a','b','c']))",No,5,20.0 df.groupby('StateHoliday')['Open'].mean(),No,5,60.0 df.SchoolHoliday.unique(),No,5,57.0 "plt.subplot('121') df['Sales'].hist(bins=100) plt.subplot('122') df['Customers'].hist(bins=100)",No,5,33.0 df.DayOfWeek.unique(),No,5,57.0 df.DayOfWeek[df.Open == 1].value_counts(),No,3,54.0 y = df.groupby('DayOfWeek')['Sales'].sum(),No,5,60.0 "sns.barplot(x=np.arange(7)+1, y=y)",No,5,33.0 "xx = df.groupby(['Promo', 'Month'])['Sales'].sum() xx",No,3,60.0 "plt.plot(xx[0], 'g', label='no promo') plt.plot(xx[1], 'b', label='promo')",No,5,33.0 "yms = df.groupby(['Year', 'Month'])['Sales'].sum()",No,5,60.0 "sns.barplot(x=np.arange(12)+1, y=yms[2013].sort_index())",No,5,33.0 "sns.barplot(x=np.arange(12)+1, y=yms[2014].sort_index())",No,5,33.0 "sns.barplot(x=np.arange(12)+1, y=yms[2014].sort_index() + yms[2013].sort_index())",No,5,33.0 "df_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 df_store.head().T,No,5,41.0 df_store.StoreType.unique(),No,5,57.0 df_store.Assortment.unique(),No,5,57.0 "from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelEncoder",No,5,22.0 "df_store.fillna(value=0, inplace=True)",No,5,17.0 le = LabelEncoder(),No,2,4.0 df_store.StoreType = le.fit_transform(df_store.StoreType),No,5,20.0 df_store.Assortment = le.fit_transform(df_store.Assortment),No,5,20.0 df_store.PromoInterval.unique(),No,5,57.0 "dict1 = {0: 0, 'Jan,Apr,Jul,Oct': 1, 'Feb,May,Aug,Nov': 2, 'Mar,Jun,Sept,Dec': 3}",No,2,23.0 "df_store.PromoInterval = df_store.PromoInterval.apply(lambda x: dict1[x], convert_dtype=False)",No,4,8.0 "df_store.drop('Store', axis=1, inplace=True)",No,5,10.0 "df = df.join(df_store, on='Store')",No,5,32.0 df.PromoInterval = df.PromoInterval.map(lambda x: int(x)),No,5,16.0 print (df['StoreType'].value_counts() / len(df)),No,5,72.0 print (df['Assortment'].value_counts() / len(df)),No,5,72.0 "sales8 = df.groupby(['Store', 'Year', 'Month'])['Sales'].mean()[8]",No,5,60.0 "plt.plot(range(31), sales8.values) plt.plot([df.loc[7].CompetitionOpenSinceMonth+12, df.loc[7].CompetitionOpenSinceMonth+12], [sales8.min(), sales8.max()])",No,5,33.0 "df_t = df.drop(['Customers', 'Date', 'Year'], axis=1)",No,5,10.0 df_t.head(10).T,No,5,41.0 df_t = df_t[df_t.Open == 1],No,5,14.0 df_t.shape,No,5,58.0 "df_t.drop('Open', axis=1, inplace=True)",No,5,10.0 "from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import validation_curve",No,5,22.0 "b""X_train, X_test, y_train, y_test = train_test_split(df_t.drop('Sales', axis=1).values, df_t.Sales.values,\\\n test_size = 0.2)""",No,5,13.0 from sklearn.ensemble import RandomForestRegressor,No,5,22.0 "params = range(50, 201, 50)",No,3,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(), X_train[:20000],\\\n y_train[:20000], 'n_estimators', params, \\\n cv=5, 
scoring='r2', verbose=2)""",No,4,6.0 "scores_mean = scores.mean(axis=1) scores_std = scores.std(axis=1) tst_scr_mean = tst_scr.mean(axis=1) tst_scr_std = tst_scr.std(axis=1) plt.plot(params, tst_scr_mean) plt.fill_between(params, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3) plt.plot(params, scores_mean) plt.fill_between(params, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)",No,3,33.0 "params = range(3, 9)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=100), X_train[:20000], \\\n y_train[:20000], 'max_features', params, \\\n cv=3, scoring='r2', verbose=2)""",No,5,6.0 "params = range(5, 51, 5)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=100), X_train[:20000], \\\n y_train[:20000], 'max_depth', params, \\\n cv=3, scoring='r2', verbose=2)""",No,4,6.0 "model1 = RandomForestRegressor(n_estimators=100, max_depth=20, n_jobs=4, verbose=2)",No,5,4.0 "model1.fit(X_train, y_train)",No,5,7.0 idx = model1.feature_importances_.argsort()[::-1],No,4,79.0 "ax = sns.barplot(x=df_t.drop('Sales', axis=1).columns[idx], y=model1.feature_importances_[idx]) _ = plt.setp(ax.get_xticklabels(), rotation=-90)",No,5,79.0 y_pred = model1.predict(X_test),No,5,48.0 "from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score",No,5,22.0 "mean_absolute_error(y_test, y_pred)",No,5,49.0 "r2_score(y_test, y_pred)",No,5,49.0 df_test = pd.read_csv('../input/test.csv'),No,5,45.0 df_test['Year'] = df_test.Date.map(lambda x: int(x[:4])),No,5,8.0 df_test['Month'] = df_test.Date.map(lambda x: int(x[5:7])),No,5,8.0 "df_test.StateHoliday = df_test['StateHoliday'].map(lambda x: (int)(x in ['a','b','c']))",No,4,8.0 "df_test = df_test.join(df_store, on='Store')",No,5,32.0 "df_test.drop(['Id', 'Date', 'Year'], axis=1, inplace=True)",No,5,10.0 "df_test.fillna(0, inplace=True)",No,5,17.0 df_test.head().T,No,5,41.0 df_test.Open.unique(),No,5,57.0 "ind_open = df_test.Open == 1 ind_closed = df_test.Open == 0 df_test2 = df_test[ind_open]",No,5,14.0 "df_test2.drop('Open', axis=1, inplace=True)",No,5,10.0 df_test2.head().T,No,5,41.0 X_out = df_test2.values,No,2,16.0 y_out = model1.predict(X_out),No,5,48.0 "df_out = pd.DataFrame(np.zeros(len(df_test)), columns=['Sales'])",No,5,12.0 "df_out[ind_open] = y_out.reshape(-1,1)",No,5,84.0 "df_out.set_index(np.arange(len(df_out))+1, inplace=True)",No,5,55.0 df_out.index.name = 'Id',No,5,55.0 df_out.head(),No,5,41.0 df_out.to_csv('out5.csv'),No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "df_store = pd.read_csv('../input/store.csv', index_col='Store')",No,5,45.0 df_store.head(),No,5,41.0 df_store.info(),No,5,40.0 "categorial_features = ['StoreType', 'Assortment', 'PromoInterval']",No,1,23.0 "plt.figure(figsize=(20,16)) for i,country in enumerate(list_countries): Fatal_diff=Fatal_pivot[(Fatal_pivot[country]>0)][country].diff().fillna(0) Fatal_diff=Fatal_diff[Fatal_diff>0] plt.subplot(3,4,i+1) Fatal_diff.plot(color=colors[i],label=country.upper(),lw=5) plt.xticks(rotation=60) plt.title('Number of daily new Fatalities in {}'.format(country.upper())) plt.legend(title='Country') plt.tight_layout() ",No,5,33.0 "from sklearn.preprocessing import OneHotEncoder, LabelEncoder for p in categorial_features: X_int = LabelEncoder().fit_transform(df_store[p].values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(X_int) tmp = pd.DataFrame(ohe_feat, columns=['{0}='.format(p) + str(i) for i 
in df_store[p].unique()], index=df_store.index, dtype=int) df_store = pd.concat([df_store, tmp], axis=1) df_store = df_store.drop(p, axis=1)",No,4,8.0 "for p in ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']: df_store.loc[:, p] = (df_store[p] - df_store[p].mean()) / df_store[p].std()",No,4,18.0 "# Understanding New cases confirmation variations on daily basis plt.figure(figsize=(20,16)) for i,country in enumerate(list_countries): plt.subplot(4,3,i+1) train_df[(train_df['Country_Region']==country)&(train_df['ConfirmedCases']!=0)].groupby('Date')['ConfirmedCases'].sum().diff().diff().plot(color=colors[i]) plt.ylabel('Difference in Daily reporting cases ') plt.title('Variation of {}'.format(country),va='bottom') plt.suptitle('Variation in number of confirmed cases on daily basis',fontsize=24,va='baseline')",No,5,33.0 "plt.figure(figsize=(16,8)) plt.title('Confirmed Cases trend from first day of incidence') for i,country in enumerate(list_countries): confirm_group=train_df[(train_df['Country_Region']==country)&train_df['ConfirmedCases']!=0].groupby('Date').agg({'ConfirmedCases':['sum']}) confirm_value=[j for j in confirm_group.ConfirmedCases['sum'].values] plot_value=confirm_value[0:60] plt.plot(plot_value,color=colors[i],label=country,lw=2) plt.legend(title='Countries')",No,5,33.0 "from sklearn.manifold import TSNE model = TSNE() arr = model.fit_transform(df_store.fillna(0)) plt.scatter(arr[:, 0], arr[:, 1])",No,3,33.0 "plt.figure(figsize=(16,10)) plt.title('Fatalities trend from first day of incidence') for i,country in enumerate(list_countries): fatal_group=train_df[(train_df['Country_Region']==country)&train_df['ConfirmedCases']!=0].groupby('Date').agg({'Fatalities':['sum']}) fatal_value=[j for j in fatal_group.Fatalities['sum'].values] plot_value=fatal_value[0:60] plt.plot(plot_value,color=colors[i],label=country,lw=2) plt.legend(title='Countries')",No,5,33.0 "from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import silhouette_score scores = [] ns = list(range(2, 10)) + list(range(10, 30, 5)) for n in ns: agc = AgglomerativeClustering(n_clusters=n) store_cluster = agc.fit_predict(df_store.fillna(0)).reshape(-1,1) scores.append(silhouette_score(df_store.fillna(0), store_cluster.ravel())) plt.plot(ns, scores)",No,2,6.0 "agc = AgglomerativeClustering(n_clusters=9) store_cluster = agc.fit_predict(df_store.fillna(1)) store_cluster.shape",No,3,7.0 "from sklearn.manifold import TSNE model = TSNE() arr = model.fit_transform(df_store.fillna(1)) plt.scatter(arr[:, 0], arr[:, 1], c=store_cluster)",No,3,56.0 "print(df_train.shape) df_train.head()",No,3,45.0 df_train.StateHoliday.unique(),No,5,57.0 "df_train.replace({'StateHoliday': {0: '0'}}, inplace=True) df_train.StateHoliday.unique()",No,5,57.0 "SH_int = LabelEncoder().fit_transform(df_train.StateHoliday.values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(SH_int) tmp = pd.DataFrame(ohe_feat, columns=['SH='+ str(i) for i in df_train.StateHoliday.unique()], index=df_train.index, dtype=int) df_train = df_train.drop('StateHoliday', axis=1) df_train = pd.concat([df_train, tmp], axis=1)",No,2,8.0 "ohe_feat = OneHotEncoder(sparse=False).fit_transform(df_train.DayOfWeek.values.reshape(-1,1)) tmp = pd.DataFrame(ohe_feat, columns=['DayOfWeek=' + str(i) for i in df_train.DayOfWeek.unique()], index=df_train.index, dtype=int) df_train = df_train.drop('DayOfWeek', axis=1) df_train = pd.concat([df_train, tmp], axis=1)",No,3,8.0 "print(df_train.shape, 
df_train.columns)",No,3,8.0 "df_train['label'] = pd.Series([store_cluster[ind - 1] for ind in df_train.index], index=df_train.index) y_train = df_train[df_train.Open != 0].Sales.values X_train = df_train[df_train.Open != 0].drop(['Date', 'Sales', 'Customers'], axis=1).values",No,2,58.0 "from sklearn.linear_model import Ridge from sklearn.metrics import accuracy_score rlnrs = {} for i, c in enumerate(np.unique(store_cluster)): df_c = df_train[df_train.label == c] if df_c.shape[0] == 0: continue X_c = df_c.drop(['Date', 'Sales', 'Customers'], axis=1).values y_c = df_c.Sales.values rlnr = Ridge() rlnr.fit(X_c, y_c) rlnrs.update({c: rlnr}) print(c, rlnr.score(X_c, y_c))",No,3,21.0 "df_test = pd.read_csv('../input/test.csv', index_col='Id')",No,2,7.0 print(df_test.Open.unique()),No,5,57.0 "df_test.loc[df_test.Open == 0, 'Sales'] = 0 df_test.Open = df_test.loc[:, 'Open'].fillna(1)",No,5,17.0 "df_test.replace({'StateHoliday': {0: '0'}}, inplace=True) SH_int = LabelEncoder().fit_transform(df_test.StateHoliday.values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(SH_int) tmp = pd.DataFrame(ohe_feat, columns=['SH='+ str(i) for i in df_test.StateHoliday.unique()], index=df_test.index, dtype=int) df_test = df_test.drop('StateHoliday', axis=1) df_test = pd.concat([df_test, tmp], axis=1) df_test['SH=b'] = 0 df_test['SH=c'] = 0 ohe_feat = OneHotEncoder(sparse=False).fit_transform(df_test.DayOfWeek.values.astype(str).reshape(-1,1)) tmp = pd.DataFrame(ohe_feat, columns=['DayOfWeek=' + str(i) for i in df_test.DayOfWeek.unique()], index=df_test.index, dtype=int) df_test = df_test.drop('DayOfWeek', axis=1) df_test = pd.concat([df_test, tmp], axis=1) df_test['label'] = pd.Series([store_cluster[ind - 1] for ind in df_test.Store], index=df_test.index) df_test = df_test.fillna(0) X_test = df_test.drop(['Store', 'Date'], axis=1).values",No,3,57.0 "plt.figure(figsize=(16,8)) plt.subplot(1,2,1) train_df.groupby('Date')['ConfirmedCases'].sum().plot(color='blue') plt.ylabel('Number of Confirmed Cases') plt.title('Confirmed Cases worldwide trend') plt.subplot(1,2,2) train_df.groupby('Date')['Fatalities'].sum().plot(color='r') plt.ylabel('Number of Fatalities') plt.title(""Fatalities worldwide trend"") plt.tight_layout()'",No,5,33.0 "# Confirmed Cases and Fatalities without China's data plt.figure(figsize=(16,8)) plt.subplot(1,2,1) train_df[(train_df['Country_Region']!='China')&(train_df['ConfirmedCases']!=0)].groupby('Date')['ConfirmedCases'].sum().plot(color='blue') plt.ylabel('Number of Confirmed Cases') plt.title('Confirmed Cases worldwide trend(without China)') plt.subplot(1,2,2) train_df[(train_df['Country_Region']!='China')&(train_df['Fatalities']!=0)].groupby('Date')['Fatalities'].sum().plot(color='red') plt.ylabel('Number of Fatalities') plt.title(""Fatalities worldwide trend(without China)"") plt.tight_layout() '",No,5,33.0 countries=train_df['Country_Region'].unique(),No,5,57.0 "country_list=[] confirmation_list=[] list_fatality=[] for country in countries: country_list.append(country) confirm_country=train_df[train_df.Country_Region==country].groupby('Date')['ConfirmedCases'].sum().max() confirmation_list.append(confirm_country) fatal_country=train_df[train_df.Country_Region==country].groupby('Date')['Fatalities'].sum().max() list_fatality.append(fatal_country) max_dict={'Country':country_list,'ConfirmedCases':confirmation_list,'Fatalities':list_fatality} map_df=pd.DataFrame.from_dict(max_dict)",Yes,2,8.0 map_df,No,5,41.0 
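The per-country loop above can also be written as a single groupby; the sketch below is only an illustration and assumes the same train_df columns (Country_Region, Date, ConfirmedCases, Fatalities) used in the cells above.

import pandas as pd

def country_maxima(train_df: pd.DataFrame) -> pd.DataFrame:
    # sum provinces per country and date, then keep each country's peak cumulative value
    daily = (train_df
             .groupby(['Country_Region', 'Date'])[['ConfirmedCases', 'Fatalities']]
             .sum())
    return (daily
            .groupby(level='Country_Region').max()
            .reset_index()
            .rename(columns={'Country_Region': 'Country'}))

# map_df = country_maxima(train_df)  # should match the loop-built max_dict DataFrame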
code_df=pd.read_csv('../input/countrycodes/country-codes.csv'),No,5,45.0 "code_df=code_df[['ISO3166-1-Alpha-3','CLDR display name']]",No,5,10.0 "map_df=map_df.merge(code_df,left_on='Country',right_on='CLDR display name')",No,5,32.0 "map_df.drop('CLDR display name',axis=1,inplace=True)",No,5,10.0 "map_df.rename(columns={'ISO3166-1-Alpha-3':'Country Code'},inplace=True)",No,5,61.0 " from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot init_notebook_mode(connected=True) data=go.Choropleth( locations=map_df['Country Code'], # Spatial coordinates z = map_df['ConfirmedCases'], # Data to be color-coded, colorscale = 'Reds', text=map_df['Country'], colorbar_title = ""Number of Confirmed Cases"",) fig=go.Figure(data) fig.update_layout( title='Covid-19 Confirmed Cases', geo=dict(showframe=False, projection={'type':'robinson'})) iplot(fig)'",No,5,33.0 test_df['Date']=pd.to_datetime(test_df['Date']),No,5,16.0 "test_df['Province_State']=test_df.drop('Province_State',axis=1)",No,5,10.0 train_df=train_df.reset_index(),No,2,10.0 "def error_metrics(model, predictions, y_test): print(""Model: "", model) # The mean squared error print(""--Mean squared error: %.2f"" % mean_squared_error(y_test, predictions)) # RMS print('--Root Mean Squared Error: %.2f' % np.sqrt(metrics.mean_squared_error(y_test, predictions))) # Explained variance score: 1 is perfect prediction print('--Variance score: %.2f' % r2_score(y_test, predictions))'",No,5,84.0 "# Take a look at some of the results def inspect_df(predictions, y_test): true_vs_pred = np.vstack((predictions, y_test)) true_df = pd.DataFrame(true_vs_pred) true_df = true_df.transpose() true_df.columns = [""Predicted"", ""Actual""] return true_df",No,5,12.0 "from IPython.display import display_html def display_side_by_side(*args): html_str='' for df in args: html_str+=df.to_html() display_html(html_str.replace('table','table style=""display:inline""'),raw=True)'",No,5,53.0 "ridge_pred, y_test = linear_models(x, y, ""ridge"") lasso_pred, y_test = linear_models(x, y, ""lasso"") xgb_pred, y_test = linear_models(x, y, ""xgb"") lgb_pred, y_test = linear_models(x, y, ""catboost"")",No,5,53.0 "error_metrics(""Ridge"", ridge_pred, y_test) error_metrics(""Lasso"", lasso_pred, y_test) error_metrics(""xgboost regression"", xgb_pred, y_test) error_metrics(""catboost regression"", lgb_pred, y_test)",No,5,49.0 "X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 42) model = CatBoostRegressor(random_state = 42) """"""grid = {'learning_rate': [0.03, 0.1], 'depth': [4, 6, 10], 'l2_leaf_reg': [1, 3, 5, 7, 9], #'num_leaves' : [5, 15, 31, 60], 'bagging_temperature' : [0, 1, 5, 10]} grid_search_model = model.grid_search(grid, X=X_train, y=y_train, cv = 5 )"""""" '",No,3,13.0 "model = CatBoostRegressor(num_leaves = 31, bagging_temperature= 0, depth = 6, l2_leaf_reg = 1, learning_rate = 0.03).fit(X_train, y_train) predictions = model.predict(X_test) error_metrics(""Catboost regression"", predictions, y_test)",No,3,7.0 print(model.get_feature_importance(prettified = True)),No,3,79.0 test.columns,No,5,71.0 "submit = pd.DataFrame() submit['ForecastId'] = test['ForecastId'] test.drop([""ForecastId""], axis = 1, inplace = True)'",No,4,12.0 submit,No,5,41.0 fatilities = model.predict(test),No,5,48.0 "model = CatBoostRegressor(num_leaves = 31, bagging_temperature= 0, depth = 4, l2_leaf_reg = 5, learning_rate = 0.1).fit(X_train, y_train) predictions = model.predict(X_test) error_metrics(""Catboost regression"", predictions, y_test)",No,4,79.0 confirmedCases = 
model.predict(test),Yes,5,48.0 "submit['ConfirmedCases'] = confirmedCases submit['Fatalities'] = fatilities",No,5,55.0 "submit.to_csv('submission.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression",No,5,22.0 "test = pd.read_csv(""../input/test.csv"",parse_dates=[3],index_col=""Id"",dtype={""StateHoliday"":np.str}) train = pd.read_csv(""../input/train.csv"",parse_dates=[2],dtype={""StateHoliday"":np.str}) print(test.dtypes) # Kaggle doesn't seem to support Python 2.7. Oh well. print(train.dtypes)'",No,4,45.0 "set(test.Store.values).issubset(train.Store.values) # if true, I can predict sales per store, not the whole model with store info joined",No,5,53.0 "train = train.loc[train.Sales > 0] # Any day and store with 0 sales is ignored in scoring => no reason to predict them, either",No,5,14.0 "def prepare(df): # transform the date into something human-meaningful df['Year'] = pd.DatetimeIndex(df.Date).year df['Month'] = pd.DatetimeIndex(df.Date).month df['Day'] = pd.DatetimeIndex(df.Date).day # encode StateHolidays into numbers # Since there are only 'a' state holidays in test set, I can probably map a, b, c into 1 df[df.StateHoliday != '0'] = 1 df.StateHoliday = pd.to_numeric(df.StateHoliday) return df",No,5,21.0 "train = prepare(train); test = prepare(test); print(train.dtypes) print(test.dtypes)",No,4,70.0 "# Curses! NA! Foiled again! test.iloc[pd.isnull(test).any(1).nonzero()]",No,5,14.0 "test_nona = test.dropna().copy() stores = set(test_nona.Store.values) test_nona['Sales'] = 0 # create a column to be filled",No,5,17.0 "columns = ['DayOfWeek','Open','Promo','SchoolHoliday','Year','Month','Day','StateHoliday'] # Customers are not present in test, not worth using for store in stores: # takes *FOREVER* to run # pandas throws ""IndexingError: Unalignable boolean Series key provided"" if I index df directly # well, it must think it's so clever train_store_indices = (train.Store.values==store) y_train = train.Sales.values[train_store_indices] X_train = train[columns].values[train_store_indices] model = LinearRegression(normalize=True,n_jobs=-1).fit(X_train,y_train) test_store_indices = (test_nona.Store==store) X_test = test_nona[columns].values[test_store_indices] test_nona.Sales.values[test_store_indices] = model.predict(X_test)'",No,2,27.0 "test['Sales'] = test_nona.Sales test = test.fillna(0) # we didn't predict some stores with NAs, tell Kaggle to ignore them",No,5,17.0 "test[['Sales']].to_csv(""submission.csv"")'",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "import sklearn.cross_validation as cv import sklearn.preprocessing as preprocessing import sklearn.feature_extraction as fe import sklearn.ensemble as es",No,5,22.0 "b""store = pd.read_csv('../input/store.csv')\ntrain = pd.read_csv('../input/train.csv',low_memory=False) # low_memory\ntest = pd.read_csv('../input/test.csv')""",No,5,45.0 "test.fillna(1, inplace=True) train = train[train[""Open""] != 0]",No,4,17.0 "train = pd.merge(train,store,on='Store') test = pd.merge(test,store,on='Store')",No,5,32.0 "sale_means = train.groupby('Store').mean().Sales sale_means.name = 'Sales=_Means' train = train.join(sale_means,on='Store') test = test.join(sale_means,on='Store')",No,2,32.0 "y = train.Sales.tolist() train_ = train.drop(['Date','Sales','Store','Customers'],axis=1).fillna(0) train_dic = train_.fillna(0).to_dict('records') test_dic = test.drop([""Date"",""Store"",""Id""],axis=1).fillna(0).to_dict('records')'",No,3,10.0 "dv = fe.DictVectorizer() X = dv.fit_transform(train_dic) Xo = dv.transform(test_dic)",No,5,8.0 "maxmin = preprocessing.MinMaxScaler() X = maxmin.fit_transform(X.toarray()) Xo = maxmin.transform(Xo.toarray())",No,5,18.0 "clf = es.RandomForestRegressor(n_estimators=25) clf.verbose = True clf.n_jobs = 8 clf",No,5,4.0 "clf.fit(Xtrain,Ytrain) print (""Training Score :"" + str(clf.score(Xtrain,Ytrain))) print (""Test Score : "" + str(clf.score(Xtest,Ytest)) )",No,3,7.0 "Yresult = clf.predict(Xtest) Yresult = np.array(Yresult) Ytest = np.array(Ytest)",No,5,48.0 "result = clf.predict(Xo) output = pd.DataFrame(test.Id).join(pd.DataFrame(result,columns=['Sales'])) output.to_csv('output.csv',index=False)",No,4,25.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np sns.set_style('whitegrid') %matplotlib inline # df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") # df_train['Year'] = df_train['Date'].apply(lambda x: int(x[:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train.head()'",No,3,22.0 "cust_sales = pd.DataFrame() cust_sales['Customers'] = df_train['Customers'] cust_sales['Sales'] = df_train['Sales'] correlation_matrix = cust_sales.corr().abs() plt.subplots(figsize=(13, 9)) sns.heatmap(correlation_matrix,annot=True)",No,3,33.0 "df_train[""HolidayBin""] = df_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = df_train, size = 4, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = df_train, size = 4, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 4, kind =""box"", palette =""muted"")'",No,5,33.0 "# df_train['StateHoliday'] = df_train['StateHoliday'].replace(0, '0') df_train[""HolidayBin""] = df_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) sns.factorplot(x =""Year"", y =""Sales"", hue =""StateHoliday"", data = df_train, size = 6, kind =""bar"", palette =""muted"")'",Yes,3,16.0 "import spacy nlp = spacy.blank(""en"") def get_token_num_by_offset(s, offset): s_pre = s[:offset] return len(spacy_tok.tokenizer(s_pre)) 
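# A brief illustration of the helper above: it maps one of the dataset's character
# offsets (e.g. Pronoun-offset) to a spaCy token index by tokenizing only the prefix.
# For instance, get_token_num_by_offset("Alice met Bob", 10) would typically return 2,
# since the prefix "Alice met " tokenizes into two tokens and "Bob" is then token 2.
# (spacy_tok is assumed to be the spaCy-backed tokenizer created elsewhere in the notebook.)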
# note that 'xxunk' is not special in this sense special_tokens = ['xxbos','xxfld','xxpad', 'xxmaj','xxup','xxrep','xxwrep'] def adjust_token_num(processed, token_num): """""" As fastai tokenizer introduces additional tokens, we need to adjust for them. """""" counter = -1 do_unrep = None for i, token in enumerate(processed): if token not in special_tokens: counter += 1 if do_unrep: do_unrep = False if processed[i+1] != ""."": token_num -= (int(token) - 2) # one to account for the num itself else: # spacy doesn't split full stops token_num += 1 if token == ""xxrep"": do_unrep = True if counter == token_num: return i else: counter = -1 for i, t in enumerate(processed): if t not in special_tokens: counter += 1 print(i, counter, t) raise Exception(f""{token_num} is out of bounds ({processed})"")'",Yes,3,8.0 "def dataframe_to_tensors(df, max_len=512): # offsets are: pron_tok_offset, a_tok_offset, a_tok_right_offset, b_tok_offset, b_tok_right_offset offsets = list() labels = np.zeros((len(df),), dtype=np.int64) processed = list() for i, row in tqdm(df.iterrows()): try: text = row[""Text""] a_offset = row[""A-offset""] a_len = len(nlp(row[""A""])) b_offset = row[""B-offset""] b_len = len(nlp(row[""B""])) pron_offset = row[""Pronoun-offset""] is_a = row[""A-coref""] is_b = row[""B-coref""] a_tok_offset = get_token_num_by_offset(text, a_offset) b_tok_offset = get_token_num_by_offset(text, b_offset) a_right_offset = a_tok_offset + a_len - 1 b_right_offset = b_tok_offset + b_len - 1 pron_tok_offset = get_token_num_by_offset(text, pron_offset) tokenized = tokenizer.process_text(text, spacy_tok)[:max_len] tokenized = [""xxpad""] * (max_len - len(tokenized)) + tokenized # add padding a_tok_offset = adjust_token_num(tokenized, a_tok_offset) a_tok_right_offset = adjust_token_num(tokenized, a_right_offset) b_tok_offset = adjust_token_num(tokenized, b_tok_offset) b_tok_right_offset = adjust_token_num(tokenized, b_right_offset) pron_tok_offset = adjust_token_num(tokenized, pron_tok_offset) numericalized = vocab.numericalize(tokenized) processed.append(torch.tensor(numericalized, dtype=torch.long)) offsets.append([pron_tok_offset, a_tok_offset, a_tok_right_offset, b_tok_offset, b_tok_right_offset]) if is_a: labels[i] = 0 elif is_b: labels[i] = 1 else: labels[i] = 2 except Exception as e: print(i) raise processed = torch.stack(processed) offsets = torch.tensor(offsets, dtype=torch.long) labels = torch.from_numpy(labels) return processed, offsets, labels",No,4,12.0 "train_ds = TensorDataset(*dataframe_to_tensors(test)) valid_ds = TensorDataset(*dataframe_to_tensors(val)) test_ds = TensorDataset(*dataframe_to_tensors(train))",No,4,13.0 "train_dl = DataLoader(train_ds, batch_size=64, shuffle=True) valid_dl = DataLoader(valid_ds, batch_size=32, shuffle=False) test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)",No,3,45.0 lm.freeze(),No,2,23.0 "encoder_hidden_sz = 400 device = torch.device(""cuda"") class CorefResolver(nn.Module): def __init__(self, encoder, dropout_p=0.3): super(CorefResolver, self).__init__() self.encoder = encoder self.dropout = nn.Dropout(dropout_p) self.hidden2hidden = nn.Linear(encoder_hidden_sz * 2 + 1, 25) self.hidden2logits = nn.Linear(50, 3) self.relu = nn.ReLU() self.activation = nn.LogSoftmax(dim=1) self.loss = nn.NLLLoss() def forward(self, seqs, offsets, labels=None): encoded = self.dropout(self.encoder(seqs)[0][2]) a_q = list() b_q = list() for enc, offs in zip(encoded, offsets): # extract the hidden states that correspond to A, B and the pronoun, and make pairs of those 
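# offs follows the order built in dataframe_to_tensors:
# [pronoun, A_start, A_end, B_start, B_end], so enc[offs[0]] is the pronoun's hidden
# state and enc[offs[2]] / enc[offs[4]] are the last-token states of mentions A and B.
# Each pair below is [pronoun_state ; mention_state ; dot(pronoun_state, mention_state)],
# i.e. 2 * encoder_hidden_sz + 1 features, matching the hidden2hidden input size.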
a_repr = enc[offs[2]] b_repr = enc[offs[4]] a_q.append(torch.cat([enc[offs[0]], a_repr, torch.dot(enc[offs[0]], a_repr).unsqueeze(0)])) b_q.append(torch.cat([enc[offs[0]], b_repr, torch.dot(enc[offs[0]], b_repr).unsqueeze(0)])) a_q = torch.stack(a_q) b_q = torch.stack(b_q) # apply the same ""detector"" layer to both batches of pairs is_a = self.relu(self.dropout(self.hidden2hidden(a_q))) is_b = self.relu(self.dropout(self.hidden2hidden(b_q))) # concatenate outputs of the ""detector"" layer to get the final probability distribution is_a_b = torch.cat([is_a, is_b], dim=1) is_logits = self.hidden2logits(self.dropout(self.relu(is_a_b))) activation = self.activation(is_logits) if labels is not None: return activation, self.loss(activation, labels) else: return activation",Yes,3,23.0 enc = lm.model[0],No,3,4.0 resolver = CorefResolver(enc),No,2,53.0 resolver.to(device),No,3,23.0 "for param in resolver.encoder.parameters(): param.requires_grad = False",No,4,23.0 "lr = 0.001 loss_fn = nn.NLLLoss() optimizer = torch.optim.Adam(resolver.parameters(), lr=lr)",No,4,4.0 from sklearn.metrics import classification_report,No,5,22.0 "def train_epoch(model, optimizer, train_dl, report_every=10): model.train() step = 0 total_loss = 0 for texts, offsets, labels in train_dl: texts, offsets, labels = texts.to(device), offsets.to(device), labels.to(device) step += 1 optimizer.zero_grad() _, loss = model(texts, offsets, labels) total_loss += loss.item() loss.backward() optimizer.step() if step % report_every == 0: print(f""Step {step}, loss: {total_loss/report_every}"") total_loss = 0 def evaluate(model, optimizer, valid_dl, probas=False): probas = list() model.eval() predictions = list() total_loss = 0 all_labels = list() with torch.no_grad(): for texts, offsets, labels in valid_dl: texts, offsets, labels = texts.cuda(), offsets.cuda(), labels.cuda() preds, loss = model(texts, offsets, labels) total_loss += loss.item() probas.append(preds.cpu().detach().numpy()) predictions.extend([i.item() for i in preds.max(1)[1]]) print(f""Validation loss: {total_loss/len(valid_dl)}"") print() print(classification_report(valid_dl.dataset.tensors[2].numpy(), predictions)) if probas: return total_loss, np.vstack(probas) return total_loss, predictions",Yes,4,2.0 "total_epoch = 0 best_loss = 1e6 for i in range(3): print(""Epoch"", i + 1) total_epoch += 1 train_epoch(resolver, optimizer, train_dl) loss, labels = evaluate(resolver, optimizer, valid_dl) if loss < best_loss: best_loss = loss print(f""Loss improved, saving {total_epoch}"") torch.save(resolver.state_dict(), data_path/""model_best.pt"")",No,5,7.0 "for param in resolver.encoder.parameters(): param.requires_grad = True",No,4,59.0 "lr = 3e-4 optimizer = torch.optim.Adam(resolver.parameters(), lr=lr)",No,5,4.0 x = np.load('../input/prepare-for-submission/0.npy'),No,5,44.0 "from keras.models import load_model import tensorflow as tf def logloss(y, y_): return tf.losses.log_loss(y,y_) model = load_model(mf, custom_objects={'logloss':logloss})",No,4,4.0 import gc,No,5,22.0 y1 = model.predict(x),No,5,27.0 "for i in range(6): print(""Epoch"", i + 1) total_epoch += 1 train_epoch(resolver, optimizer, train_dl) loss, labels = evaluate(resolver, optimizer, valid_dl) if loss < best_loss: best_loss = loss print(f""Loss improved, saving {total_epoch}"") torch.save(resolver.state_dict(), data_path/""model_best.pt"")",No,4,2.0 "resolver.load_state_dict(torch.load(data_path/""model_best.pt""))",No,5,44.0 "loss, res = evaluate(resolver, optimizer, test_dl, True) res_s = np.exp(res) # don't 
forget that we have log-softmax outputs: submission = pd.DataFrame(res_s, index=train[""ID""], columns=[""A"", ""B"", ""NEITHER""]) submission.to_csv(""submission.csv"", index=""id"")'",Yes,3,27.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import validation_curve from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import cross_val_score from sklearn.metrics import r2_score %matplotlib inline",No,5,23.0 "del x gc.collect()",No,1,53.0 "x = np.load('../input/prepare-for-submission-2/1.npy') y2 = model.predict(x) del x gc.collect()",No,2,27.0 "x = np.load('../input/prepare-for-submission-3/2.npy') y3 = model.predict(x) del x gc.collect()",No,3,27.0 "y = np.concatenate([y1, y2, y3])",No,5,11.0 "b""df = pd.DataFrame()\ndf['ID'] = pd.read_csv('../input/gendered-pronoun-resolution/test_stage_2.tsv', delimiter='\\t')['ID']""",No,5,45.0 "df_train = pd.read_csv('../input/train.csv') df_test = pd.read_csv('../input/test.csv') df_store = pd.read_csv('../input/store.csv')",No,5,45.0 df_s = pd.read_csv('../input/gendered-pronoun-resolution/sample_submission_stage_2.csv'),No,5,45.0 "df['A'] = y[:,0] #, 'B', 'NEITHER'",No,5,8.0 "df['B'] = y[:,1] df['NEITHER'] = y[:,2]",No,5,8.0 "y = df_train[""Sales""].values",No,5,21.0 "df.to_csv('submission.csv', index=False)",No,5,25.0 "df_train['Year'] = df_train['Date'].apply(lambda x: int(x[0:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train['Day'] = df_train['Date'].apply(lambda x: int(x[8:10])) df_test['Year'] = df_test['Date'].apply(lambda x: int(x[0:4])) df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7])) df_test['Day'] = df_test['Date'].apply(lambda x: int(x[8:10]))",No,5,8.0 "b""import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n%matplotlib inline\n\nplt.style.use('ggplot')\nplt.rcParams['figure.figsize'] = (12,8)\n\n# \nfont = {'family': 'Verdana',\n 'weight': 'normal'}\nplt.rc('font', **font)""",No,5,23.0 "data_train = pd.read_csv(""../input/train.csv"") data_test = pd.read_csv(""../input/test.csv"") data_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 data_train.head(),No,5,41.0 "df_store.CompetitionDistance.fillna(value=0, inplace=True) df_test.Open.fillna(value=0, inplace=True) df_train.StateHoliday[df_train[""StateHoliday""] == 0] = ""0""",Yes,5,17.0 "print(df_train.shape) print(df_test.shape) print(df_store.shape)",No,5,58.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='Year', y='Sales', data=df_train, ax=axis1) sns.barplot(x='Year', y='Customers', data=df_train, ax=axis2)",No,5,75.0 data_train.shape,No,5,58.0 "df_train.query('Open == 1')[['Sales', 'Customers']].hist(bins=100, figsize=(13,7));",No,5,33.0 data_test.columns,No,5,71.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='Month', y='Sales', data=df_train, ax=axis1) sns.barplot(x='Month', y='Customers', data=df_train, ax=axis2)",No,5,75.0 data_store.head(n=3),No,5,41.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='DayOfWeek', y='Sales', data=df_train, ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=df_train, ax=axis2)",No,5,75.0 "data_train.StateHoliday = data_train.StateHoliday.replace(0,'0') data_test.StateHoliday = 
data_test.StateHoliday.replace(0,'0') data_train.DayOfWeek.value_counts()",No,4,8.0 "df_train[['Sales', 'Customers']].corr()",No,3,41.0 "df_DayOfWeek = pd.get_dummies(df_train.DayOfWeek, prefix='DayOfWeek') df_StateHoliday = pd.get_dummies(df_train.StateHoliday, prefix=""StateHoliday_"") df_train = pd.concat([df_train, df_DayOfWeek, df_StateHoliday], axis=1) del df_train[""Date""] del df_train[""Day""] del df_train[""Customers""] del df_train[""DayOfWeek""] del df_train[""Sales""] del df_train[""StateHoliday""]'",Yes,4,12.0 "df_StoreType = pd.get_dummies(df_store.StoreType, prefix='StoreType_') df_Assortment = pd.get_dummies(df_store.Assortment, prefix='Assortment_') df_store = pd.concat([df_store, df_StoreType, df_Assortment], axis=1) del df_store[""StoreType""] del df_store[""Assortment""] del df_store[""PromoInterval""]'",Yes,4,12.0 "df = pd.merge(df_train, df_store, how='left', on=['Store'])",No,5,32.0 "df.fillna(0, inplace=True)",No,5,17.0 "X = df.values[:,1:]",No,5,21.0 "parametrs = range(40, 241, 40)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_jobs = 4), X[:20000],\\\n y[:20000], 'n_estimators', parametrs, cv=5, scoring='r2', verbose=2)""",No,5,2.0 "scores_mean = scores.mean(axis=1) scores_std = scores.std(axis=1) tst_scr_mean = tst_scr.mean(axis=1) tst_scr_std = tst_scr.std(axis=1) plt.plot(parametrs, tst_scr_mean) plt.fill_between(parametrs, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3) plt.plot(parametrs, scores_mean) plt.fill_between(parametrs, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)",No,5,35.0 "parametrs = range(3, 24)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=120, n_jobs = 4), X[:20000], \\\n y[:20000], 'max_features', parametrs, cv=3, scoring='r2', verbose=2)""",No,5,1.0 "data_train['Year'] = data_train['Date'].apply(lambda x: int(x[:4])) data_train['Month'] = data_train['Date'].apply(lambda x: int(x[5:7])) data_train.head()",No,4,8.0 "parametrs = range(4, 61, 4)",No,5,5.0 "average_sales_per_month = data_train.groupby('Month')[""Sales""].mean() plt.figure(figsize=(8, 5)) average_sales_per_month.plot(legend=True, marker='o', title=""Average sales per month"")'",No,4,33.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=120, n_jobs = 4, max_features=16), X[:20000], \\\n y[:20000], 'max_depth', parametrs, cv=3, scoring='r2', verbose=2)""",No,5,84.0 "average_sales_per_day = data_train.groupby('Date')[""Sales""].mean() fig = plt.subplots(1,1, sharex=True, figsize=(18, 5)) average_sales_per_day.plot(legend=True, title=""Average Daily Sales"")'",No,3,33.0 "model = RandomForestRegressor(n_estimators=120, max_depth=20, max_features=16, n_jobs=4, verbose=2)",No,5,4.0 "model.fit(X, y)",No,5,7.0 idx = model.feature_importances_.argsort()[::-1],No,5,79.0 "ax = sns.barplot(x=model.feature_importances_[idx], y=df.drop('Store', axis=1).columns[idx])",No,5,79.0 "df_DayOfWeek = pd.get_dummies(df_test.DayOfWeek, prefix='DayOfWeek') df_StateHoliday = pd.get_dummies(df_test.StateHoliday, prefix=""StateHoliday_"") df_StateHoliday = pd.concat([df_StateHoliday, pd.DataFrame(columns=['StateHoliday__b', 'StateHoliday__c'])], axis = 1)'",Yes,3,20.0 "columns_corr = ['Sales', 'Customers', 'Promo', 'StateHoliday', 'SchoolHoliday'] data_train[columns_corr].corr(method='pearson')",No,5,40.0 "df_StateHoliday.fillna(0, inplace=True)",No,5,17.0 "data_train['StateHoliday'] = data_train['StateHoliday'].replace(0, '0') data_train[""HolidayBin""] = 
data_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) data_train.StateHoliday.unique()'",No,4,8.0 "del df_test[""Date""] del df_test[""Day""] del df_test[""DayOfWeek""] del df_test[""StateHoliday""] df_test = pd.concat([df_test, df_DayOfWeek, df_StateHoliday], axis=1) test_df = pd.merge(df_test, df_store, how='left', on=['Store']) test_df.fillna(0, inplace=True)'",Yes,4,12.0 "y_test_pred = model.predict(test_df.values[:,2:])",No,5,48.0 "average_customers_per_month = data_train.groupby('Month')['Customers'].mean() average_sales_per_month = data_train.groupby('Month')['Sales'].mean()",No,4,60.0 "submission = pd.DataFrame({ ""Id"": test_df.Id, ""Sales"": y_test_pred.reshape(-1.1)})",No,5,12.0 "plt.figure(figsize=(6, 4)) plt.plot(average_sales_per_month)",No,5,33.0 "plt.figure(figsize=(6, 4)) plt.plot(average_customers_per_month)",No,5,33.0 "submission.to_csv(""rossman.csv"",index=False)",No,5,25.0 "import pandas as pd import sklearn import numpy as np import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestRegressor import seaborn as sns %matplotlib inline ",No,5,23.0 "total_customers_for_store = data_train.groupby('Store')['Sales', 'Customers'].sum()",No,5,60.0 "data_total_customers_for_store = pd.DataFrame({'Sales': total_customers_for_store['Sales'], 'Customers': total_customers_for_store['Customers']}, index = total_customers_for_store.index)",No,5,12.0 "storedf = pd.read_csv(""../input/store.csv"") storedf = storedf [[""Store"", ""Assortment"", ""CompetitionDistance"",""Promo2""]] storedf = storedf.set_index(""Store"") storedf.CompetitionDistance = storedf.CompetitionDistance.fillna(storedf.CompetitionDistance.max()) storedf.head()",Yes,3,45.0 data_total_customers_for_store = data_total_customers_for_store.reset_index(),No,5,61.0 "average_sales_customers = data_train.groupby('Store')['Sales', 'Customers'].mean()",No,5,60.0 "def f(traindf): #traindf = traindf[traindf.Open==1] traindf = traindf.join(storedf, on=""Store"") traindf['isWeekEnd'] = traindf.DayOfWeek>=5 traindf['Month'] = list (map (lambda x: int(x[5:7]), traindf.Date)) traindf['Day'] = list (map (lambda x: int(x[8:]), traindf.Date)) traindf['isWinter'] = np.logical_or (traindf.Month <= 2, traindf.Month == 12) traindf['isSpring'] = np.logical_and (traindf.Month >= 3, traindf.Month <= 5) traindf['isSummer'] = np.logical_and (traindf.Month >= 6, traindf.Month <= 8) traindf['isAutumn'] = np.logical_and (traindf.Month >= 9, traindf.Month <= 11) traindf['AssortmentA'] = traindf.Assortment=='a' traindf['AssortmentB'] = traindf.Assortment=='b' traindf['AssortmentC'] = traindf.Assortment=='c' traindf['isEndofMonth'] = traindf.Day >= 25 traindf['isBeginofMonth'] = traindf.Day <= 10 traindf['CompetitionDistance'] = traindf.CompetitionDistance del traindf [""Assortment""] del traindf [""StateHoliday""] del traindf [""SchoolHoliday""] del traindf [""Date""] del traindf [""Store""] del traindf [""DayOfWeek""] del traindf [""Month""] del traindf [""Day""] return traindf'",Yes,3,14.0 "data_average_sales_customers = pd.DataFrame({'Sales': average_sales_customers['Sales'], 'Customers': average_sales_customers['Customers']}, index = average_sales_customers.index) data_average_sales_customers = data_average_sales_customers.reset_index() data_stores_average = data_average_sales_customers.join(data_store.set_index('Store'), on='Store') data_stores_average.head()",Yes,4,12.0 "data_average_sales_customers = pd.DataFrame({'Sales': average_sales_customers['Sales'], 'Customers': average_sales_customers['Customers']}, 
index = average_sales_customers.index)",No,5,12.0 "traindf = pd.read_csv(""../input/train.csv"", low_memory=False) traindf = f(traindf) traindf = traindf[traindf.Open == 1] ytrain = traindf.Sales.values del traindf [""Sales""] del traindf [""Customers""] del traindf [""Open""] traindf.head()",Yes,4,45.0 data_average_sales_customers = data_average_sales_customers.reset_index(),No,5,84.0 "testdf = pd.read_csv(""../input/test.csv"") testdf = f(testdf) testdf.head()",Yes,4,45.0 "data_stores_average = data_average_sales_customers.join(data_store.set_index('Store'), on='Store')",No,5,32.0 "model = RandomForestRegressor(min_samples_leaf=2, max_depth=30, n_estimators=30) %time model.fit(traindf.values, ytrain)",No,5,7.0 data_stores_average.head(n=3),No,5,41.0 "data_stores_new = data_total_customers_for_store.join(data_store.set_index('Store'), on='Store')",No,5,32.0 "average_store_type = data_stores_new.groupby('StoreType')['Sales', 'Customers', 'CompetitionDistance'].mean()",No,5,60.0 "y = model.predict(testdf.values[:, 2:]) df = pd.DataFrame([]) df['Sales'] = y df['Sales'][testdf.Open == 0] = 0 df = df.set_index(testdf.Id) pd.DataFrame.to_csv(df, 'ans.csv') df.head()",Yes,4,27.0 "Data_cmp = pd.DataFrame() Data_cmp['Customers'] = average_store_type['Sales'] Data_cmp['Sales'] = average_store_type['Customers'] Data_cmp['Comp'] = average_store_type['CompetitionDistance']",No,4,12.0 "columns_corr = ['Sales', 'Customers', 'Comp'] Data_cmp[columns_corr].corr(method='pearson')",No,5,40.0 "average_assort = data_stores_new.groupby('Assortment')['Sales', 'Customers'].mean()",No,5,60.0 "closed_store_data = data_test[""Id""][data_test[""Open""] == 0].values data_train.StateHoliday = data_train.StateHoliday.replace(0,'0') data_test.StateHoliday = data_test.StateHoliday.replace(0,'0')'",No,5,8.0 "data_train['Year'] = data_train['Date'].apply(lambda x: int(x[:4])) data_train['Month'] = data_train['Date'].apply(lambda x: int(x[5:7])) data_train[""HolidayBin""] = data_train.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,8.0 "del data_train['Date'] del data_train['StateHoliday']",No,5,10.0 "data_test['Year'] = data_test['Date'].apply(lambda x: int(x[:4])) data_test['Month'] = data_test['Date'].apply(lambda x: int(x[5:7])) data_test[""HolidayBin""] = data_test.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,5,8.0 "del data_test['Date'] del data_test['StateHoliday']",No,5,10.0 "data_test = data_test[data_test[""Open""] != 0]",No,5,14.0 data_test[data_test['Store'] == 1].head(),No,5,41.0 "arr_tmp = [] for i in data_test['Store']: arr_tmp.append(float(data_store['CompetitionDistance'][data_store['Store'] == i])) data_test['CompetitionDistance'] = arr_tmp",No,3,8.0 "arr_tmp = [] for i in data_train['Store']: arr_tmp.append(float(data_store['CompetitionDistance'][data_store['Store'] == i])) data_train['CompetitionDistance'] = arr_tmp data_train['CompetitionDistance'] = data_train['CompetitionDistance'].fillna(data_train['CompetitionDistance'].mean())",No,3,8.0 "train_stores = dict(list(data_train.groupby('Store'))) test_stores = dict(list(data_test.groupby('Store')))",No,4,60.0 "from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "# result = pd.Series() for i in test_stores: store = train_stores[i] X_train = store.drop([""Sales"", ""Store"", ""Customers""],axis=1) Y_train = store[""Sales""] X_test = test_stores[i].copy() store_ind = X_test[""Id""] X_test.drop([""Id"",""Store""], 
axis=1,inplace=True) X_train = X_train.fillna(X_train.mean()) X_test = X_test.fillna(X_train.mean()) estimator = RandomForestRegressor(n_estimators=10, max_depth=13, criterion = 'mse') estimator.fit(X_train, Y_train) Y_pred = estimator.predict(X_test) result = result.append(pd.Series(Y_pred, index=store_ind)) result = result.append(pd.Series(0, index=closed_store_data)) result = pd.DataFrame({ ""Id"": result.index, ""Sales"": result.values}) result.to_csv('result_new.csv', index=False)'",Yes,4,25.0 "print(df_test.shape, df_test.columns)",Yes,3,17.0 "for c in rlnrs.keys(): X_c = df_test[(df_test.label == c) & (df_test.Open != 0)].drop(['Store', 'Date', 'Sales'], axis=1).values df_test.loc[(df_test.label == c) & (df_test.Open != 0), 'Sales'] = rlnrs[c].predict(X_c) df_test.Sales.mean(), df_test.Sales.min(), df_test.Sales.max()",Yes,2,16.0 "out = pd.DataFrame({ ""Id"": df_test.index, ""Sales"": df_test.Sales.values }) out.to_csv('submission.csv', index=False)'",No,2,58.0 "import numpy as np import pandas as pd from sklearn.model_selection import GridSearchCV from lightgbm import LGBMRegressor from sklearn.metrics import make_scorer from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder, FunctionTransformer from sklearn.preprocessing import StandardScaler from sklearn.base import TransformerMixin from sklearn.pipeline import make_union, make_pipeline %matplotlib inline",No,5,23.0 "df_train = pd.read_csv(""../input/train.csv"", parse_dates=[""Date""], date_parser=pd.to_datetime, low_memory=False) df_test = pd.read_csv(""../input/test.csv"", parse_dates=[""Date""], date_parser=pd.to_datetime, low_memory=False) df_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 "train = df_train.merge(df_store) test = df_test.merge(df_store)",No,5,32.0 "print(""train"") print(""max: "", df_train.Date.min()) print(""min:"", df_train.Date.max()) print(""delta: "", df_train.Date.max() - df_train.Date.min())",No,5,40.0 "print(""test"") print(""max: "", df_test.Date.min()) print(""min:"", df_test.Date.max()) print(""delta: "", df_test.Date.max() - df_test.Date.min())",No,5,40.0 "df_train.groupby(""DayOfWeek"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 df_train[(df_train.DayOfWeek == 7) & (df_train.Open == 1)].Store.unique().shape[0],No,5,54.0 "df_train.groupby(""StateHoliday"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "df_train.groupby(""SchoolHoliday"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "df_train.groupby(""Promo"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 train.PromoInterval.unique(),No,5,57.0 "def get_Promo2Active(df): months_map = {v:i+1 for i, v in enumerate([""Jan"", ""Feb"", ""Mar"", ""Apr"", ""May"", ""Jun"", ""Jul"", ""Aug"", ""Sept"", ""Oct"", ""Nov"", ""Dec""])} def is_Promo2_active(row): if row.Promo2 == 0: return 0 current_week, current_month, current_year = row.Date.week, row.Date.month, row.Date.year start_week, start_year = row.Promo2SinceWeek, row.Promo2SinceYear active_months = set([months_map[m] for m in row.PromoInterval.split("","")]) has_started = (current_year == start_year and current_week >= start_week) or current_year > start_year return int(has_started and current_month in active_months) return df.apply(is_Promo2_active, axis=1)",No,5,8.0 "def get_CompetitionActive(df): def is_competition_active(row): if np.isnan(row.CompetitionDistance): return 0 if np.isnan(row.CompetitionOpenSinceMonth) and np.isnan(row.CompetitionOpenSinceYear): return 1 current_month, current_year = row.Date.month, row.Date.year 
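# A competitor with a known distance but no recorded opening date was already treated
# as active above; otherwise it counts as active once its opening year/month is not
# later than the row's date, which is what the comparison below encodes.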
opened_month, opened_year = row.CompetitionOpenSinceMonth, row.CompetitionOpenSinceYear return int((current_year == opened_year and current_month >= opened_month) or current_year > opened_year) return df.apply(is_competition_active, axis=1)",No,5,8.0 "train[""Promo2Active""] = get_Promo2Active(train) train[""CompetitionActive""] = get_CompetitionActive(train)",No,5,8.0 "train[train.Promo2 == 1].groupby(""Promo2Active"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,60.0 "train.groupby(""CompetitionActive"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "# we already have these, but for the sake of consistence, let's do it again train = df_train.merge(df_store) test = df_test.merge(df_store) train[""Promo2Active""] = get_Promo2Active(train) train[""CompetitionActive""] = get_CompetitionActive(train) test[""Promo2Active""] = get_Promo2Active(test) test[""CompetitionActive""] = get_CompetitionActive(test)'",No,4,8.0 "train[""DayOfYear""] = train.Date.apply(lambda x: x.timetuple().tm_yday) test[""DayOfYear""] = test.Date.apply(lambda x: x.timetuple().tm_yday)",No,5,16.0 "min_date = train.Date.min() # we know that all test data happened later def date_to_day_number(df): return (df.Date - min_date).apply(lambda x: x.days)",No,5,8.0 "train[""Day""] = date_to_day_number(train) test[""Day""] = date_to_day_number(test)",No,5,8.0 "train.sort_values(""Day"", inplace=True) test.sort_values(""Day"", inplace=True)",No,5,9.0 "def rmspe(y_true, y_pred): w = np.zeros(y_true.shape, dtype=float) ind = y_true != 0 w[ind] = 1./ (y_true[ind]**2) return np.sqrt(np.mean(w * (y_true - y_pred)**2)) rmspe_scorer = make_scorer(rmspe, greater_is_better=False)",No,5,84.0 "train_baseline = train.copy() train_baseline['Last_Week_Sales'] = train_baseline.groupby(""Store"")[""Sales""].shift() train_baseline['Last_Week_Diff'] = train_baseline.groupby(""Store"")[""Last_Week_Sales""].diff() train_baseline.dropna(inplace=True, subset=[""Last_Week_Sales"", ""Last_Week_Diff""]) train_baseline.head()'",No,2,60.0 "mean_error = [] for day in range(2, train_baseline.Day.max() + 1): val = train_baseline[train_baseline.Day == day] p = val.Last_Week_Sales.values error = rmspe(val.Sales.values, p) mean_error.append(error) print('Mean Error = %.5f' % np.mean(mean_error))",No,2,60.0 "class LabelEncoderPipelineFriendly(LabelEncoder): def fit(self, X, y=None): """"""this would allow us to fit the model based on the X input."""""" super(LabelEncoderPipelineFriendly, self).fit(X) def transform(self, X, y=None): return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1) def fit_transform(self, X, y=None): return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)",No,5,20.0 "def prepare_pipeline(df): def get_DayOfWeek(df): return df[""DayOfWeek""] def get_Open(df): return df[[""Open""]] def get_Promo(df): return df[[""Promo""]] def get_StateHoliday(df): return df[""StateHoliday""] def get_SchoolHoliday(df): return df[[""SchoolHoliday""]] def get_StoreType(df): return df[""StoreType""] def get_Assortment(df): return df[""Assortment""] def get_Promo2Active(df): return df[[""Promo2Active""]] def get_CompetitionActive(df): return df[[""CompetitionActive""]] def get_CompetitionDistance(df): return df[[""CompetitionDistance""]] def get_DayOfYear(df): return df[""DayOfYear""] p = make_union(*[ make_pipeline(FunctionTransformer(get_DayOfWeek, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Open, validate=False), 
Imputer(strategy=""most_frequent"")), make_pipeline(FunctionTransformer(get_Promo, validate=False)), make_pipeline(FunctionTransformer(get_StateHoliday, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_SchoolHoliday, validate=False)), make_pipeline(FunctionTransformer(get_StoreType, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Assortment, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Promo2Active, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionActive, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionDistance, validate=False), Imputer(), StandardScaler()), make_pipeline(FunctionTransformer(get_DayOfYear, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()) ]) return p",No,5,4.0 pipeline = prepare_pipeline(train),No,3,4.0 "x_train, y_train = pipeline.fit_transform(train), train.Sales x_test = pipeline.transform(test)",No,4,7.0 "params = {""boosting_type"" : [""gbdt""], ""learning_rate"": [0.1], ""n_estimators"": [200], ""objective"": [""regression""], ""reg_alpha"": [1.0],# [0.0, 0.5, 1.0], # no time for an actual CV on kaggle ""reg_lambda"": [1.0],# [0.0, 0.5, 1.0], ""random_state"": [0], ""n_jobs"": [-1] }",No,5,59.0 "gs = GridSearchCV(LGBMRegressor(), params, scoring=rmspe_scorer, cv=2, n_jobs=1) gs.fit(x_train, y_train)",No,5,6.0 prediction = gs.predict(x_test),No,5,48.0 "pd.DataFrame({""Id"": test.Id, ""Sales"": prediction}).to_csv(""submission.csv"", sep="","", index=False)",No,5,25.0 "def prepare_pipeline_ts(df, min_shift, max_shift): def get_shifted_date(df, for_sales=False): return (df.Date.min() + pd.DateOffset(days_to_shift)) def get_DayOfWeek(df): return df[""DayOfWeek""] def get_Open(df): return df[[""Open""]] def get_Promo(df): return df[[""Promo""]] def get_StateHoliday(df): return df[""StateHoliday""] def get_SchoolHoliday(df): return df[[""SchoolHoliday""]] def get_StoreType(df): return df[""StoreType""] def get_Assortment(df): return df[""Assortment""] def get_Promo2Active(df): return df[[""Promo2Active""]] def get_CompetitionActive(df): return df[[""CompetitionActive""]] def get_CompetitionDistance(df): return df[[""CompetitionDistance""]] def get_DayOfYear(df): return df[""DayOfYear""] def get_previous_sales(df): sales = df[[""Store"", ""Sales""]].copy() for day in range(min_shift, max_shift + 1): sales[""Last-{}_Day_Sales"".format(day)] = sales.groupby(""Store"")[""Sales""].shift(day) sales[""Last-{}_Day_Diff"".format(day)] = sales.groupby(""Store"")[""Last-{}_Day_Sales"".format(day)].diff() return sales.drop([""Store"", ""Sales""], axis=1) p = make_union(*[ make_pipeline(FunctionTransformer(get_DayOfWeek, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Open, validate=False), Imputer(strategy=""most_frequent"")), make_pipeline(FunctionTransformer(get_Promo, validate=False)), make_pipeline(FunctionTransformer(get_StateHoliday, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_SchoolHoliday, validate=False)), make_pipeline(FunctionTransformer(get_StoreType, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Assortment, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Promo2Active, validate=False)), 
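# Each make_pipeline branch in this union selects one raw column via FunctionTransformer
# and then label-/one-hot-encodes, imputes or scales it; make_union (a FeatureUnion)
# concatenates the branch outputs column-wise into the final feature matrix.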
make_pipeline(FunctionTransformer(get_CompetitionActive, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionDistance, validate=False), Imputer(), StandardScaler()), make_pipeline(FunctionTransformer(get_DayOfYear, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_previous_sales, validate=False), Imputer(), StandardScaler()) ]) return p",No,5,53.0 "test_size = len(test) min_shift = (test.Date.max() - test.Date.min()).days max_shift = 180 to_drop = len(train[train.Date < train.Date.min() + pd.DateOffset(max_shift)])",No,5,77.0 "import warnings warnings.filterwarnings(""ignore"") #Data Manipulation and Treatment import numpy as np import pandas as pd from datetime import datetime #Plotting and Visualizations import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns from scipy import stats import itertools #Scikit-Learn for Modeling from sklearn import model_selection from sklearn.ensemble import RandomForestRegressor from sklearn import metrics",No,5,23.0 "def str_to_date(date): return datetime.strptime(date, '%Y-%m-%d').date()",No,5,16.0 "#The training Set df_train = pd.read_csv(""../input/train.csv"",sep=',', parse_dates=['Date'] , date_parser=str_to_date, low_memory = False) #Additional Information on those stores df_store = pd.read_csv(""../input/store.csv"" , low_memory = False)'",No,5,45.0 df_train.head() ,No,5,41.0 df_train.tail(),No,5,41.0 "df_train.dtypes,print (""The Train dataset has {} Rows and {} Variables"".format(str(df_train.shape[0]),str(df_train.shape[1])))",No,3,40.0 df_store.tail(),No,5,41.0 "df_store.dtypes ,print (""The Store dataset has {} Rows (which means unique Shops) and {} Variables"".format(str(df_store.shape[0]),str(df_store.shape[1]))) ",No,3,40.0 df_train.count(0)/df_train.shape[0] * 100,No,2,54.0 "class bert(nn.Module): def __init__(self, bert_path): super().__init__() BERT = BertModel.from_pretrained(bert_path, config = BertConfig.from_pretrained(bert_path, output_hidden_states = True)) self.BERT = BERT self.fc = nn.Sequential(nn.BatchNorm1d(self.BERT.config.hidden_size * 3), nn.Dropout(0.4), nn.Linear(self.BERT.config.hidden_size * 3, 600), nn.BatchNorm1d(600), nn.Dropout(0.4), nn.Linear(600, 600), nn.BatchNorm1d(600), nn.Dropout(0.4), nn.Linear(600,3)) def forward(self, token, at_mask, offsets, layer): out = self.BERT(token, attention_mask = at_mask)[2][layer] out_lst = [] for j in range(out.shape[0]): out_lst.append(torch.stack([torch.tensor(out[j,offsets[j,0]]),torch.tensor(out[j,offsets[j,1]]),torch.tensor(out[j,offsets[j,2]])] , dim = 0) ) out_lst = torch.stack([word_embedding for word_embedding in out_lst], dim = 0) out = out_lst.reshape(out_lst.shape[0], -1) out = self.fc(out) return out def create_model(df_len,epoch_len): model = bert(bert_path) criteria = nn.CrossEntropyLoss() optimizer = AdamW(model.parameters(), eps = 1e-06, lr = 1e-4) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=df_len*epoch_len) return model, criteria, optimizer, scheduler",Yes,3,4.0 "epoch_len = 20 model, criteria, optimizer, scheduler = create_model(len(df_train), epoch_len) set_trainable(model.BERT, False) aaa = 0 for t in range(epoch_len): tot_loss = 0 correct_train = 0 val_loss = 0 val_correct = 0 model = model.train() if GPU: model = model.cuda() for item in tqdm(train_loader): token = item[0] at_mask = item[3] offsets = item[2] target = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() target = target.cuda() offsets = 
offsets.cuda() output = model(token, at_mask, offsets, -2) loss = criteria(output, target) tot_loss += loss.item() correct_train += torch.sum(torch.max(torch.nn.functional.softmax(output, dim = 1), dim = 1)[1] == target) optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() scheduler.step() with torch.no_grad(): model = model.eval() if GPU: model = model.cuda() for item in tqdm(val_loader): token = item[0] at_mask = item[3] offsets = item[2] target = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() offsets = offsets.cuda() target = target.cuda() output = model(token, at_mask, offsets, -2) val_correct += torch.sum(torch.max(torch.nn.functional.softmax(output, dim = 1), dim = 1)[1] == target) if val_correct > aaa: bst_model = model aaa = val_correct print(tot_loss, correct_train,"" "", val_correct,"" out of "", len(val_loader)*30)",No,4,7.0 "def predict(df, dataloader, model): tmp_array = np.zeros((len(df), 3)) with torch.no_grad(): model = model.eval() if GPU: model = model.cuda() j = 0 for item in tqdm(dataloader): token = item[0] at_mask = item[2] offsets = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() offsets = offsets.cuda() output = model(token, at_mask, offsets, -2) for zz in output.cpu(): tmp_array[j] = zz j+=1 return tmp_array",No,4,49.0 "a = predict(test_2, test_2_loader, bst_model)",No,5,48.0 "bla = test_2[['ID']].merge(pd.DataFrame(torch.nn.functional.softmax(torch.tensor(a), dim = 1).numpy()), left_index=True, right_index=True).set_index('ID') bla.columns = ['A', 'B', 'NEITHER'] bla.to_csv('sbmsn2.csv')",No,4,25.0 data.columns,No,5,71.0 data.info(),No,5,40.0 data['Type'].unique(),No,5,57.0 "import matplotlib.pyplot as plt import seaborn as sns plt.figure(figsize=(14,6)) plt.subplot(1,2,1) sns.boxplot(data.revenue) plt.subplot(1,2,2) sns.distplot(data.revenue, bins=20, kde=False) plt.show()",No,5,33.0 "#City distribution data[""City""].value_counts().plot(kind='bar')'",No,5,33.0 "data[""Type""].value_counts().plot(kind='bar')'",No,5,33.0 "data[""City Group""].value_counts().plot(kind='bar')'",No,5,33.0 "# Crrelation between revenue and feature (p)s def numFeaturePlot(): features=(data.loc[:,'P1':'P37']).columns.tolist() plt.figure(figsize=(35,18)) j=1 while j7000],No,5,14.0 "g = sns.distplot(data['Sales']) g.set_title(""Data Distribution"") '",No,5,33.0 "zero_sales = data[data['Sales']==0].copy() data = data[data['Sales']!=0].drop('Open', 1)",No,4,10.0 "fig, ax = plt.subplots (1,4, figsize=(20,4)) sns.barplot(['Size'], [len(zero_sales)], ax=ax[0]) sns.countplot('DayOfWeek', data=zero_sales, ax=ax[1]) sns.countplot('Open', data=zero_sales, ax=ax[2]) sns.countplot('Promo', data=zero_sales, ax=ax[3]) plt.tight_layout()",No,5,75.0 "plt.figure(figsize=(8,8)) sns.heatmap(data .corr(), cmap='coolwarm')",No,5,80.0 "data.fillna(0, inplace=True) test.fillna(1, inplace=True)",No,5,17.0 "# Combining train and test data # Decomposing date features data['part'] = 'train' test['part'] = 'test' all_data = pd.concat([data, test], 0)[data.columns.tolist()+['Id']] all_data['Date'] = pd.to_datetime(all_data['Date']) all_data['Month'] = all_data['Date'].dt.month all_data['Year'] = all_data['Date'].dt.year all_data['Day'] = all_data['Date'].dt.day all_data['WeekOfYear'] = data.Date.dt.weekofyear all_data['Quarter'] = data.Date.dt.quarter all_data.sort_values('Date', inplace=True)",No,5,8.0 all_data.head(),No,5,41.0 "tmp = all_data[all_data['part']=='train'] sns.regplot('CompetitionOpen', 'Sales', data=tmp, 
ci=None)",No,5,33.0 "b""all_data['PromoOpen'] = 12 * (all_data.Year - all_data.Promo2SinceYear) + \\\n (all_data.WeekOfYear - all_data.Promo2SinceWeek) / 4.0\nall_data['PromoOpen'] = all_data['PromoOpen'].apply(lambda x: x if x > 0 else 0)""",No,5,8.0 "tmp = all_data[all_data['part']=='train'] sns.regplot('PromoOpen', 'Sales', data=tmp, ci=None)",No,5,33.0 "df_store[pd.isnull(df_store.CompetitionDistance)] #rows with missing values for Competition Distance, only 3 rows with null which makes sense since 99.73% is filled",No,2,14.0 "df_store_check_distribution=df_store.drop(df_store[pd.isnull(df_store.CompetitionDistance)].index) fig, axes = plt.subplots(1, 2, figsize=(17,3.5)) axes[0].boxplot(df_store_check_distribution.CompetitionDistance, showmeans=True,vert=False,) axes[0].set_xlim(0,max(df_store_check_distribution.CompetitionDistance+1000)) axes[0].set_title('Boxplot For Closest Competition') axes[1].hist(df_store_check_distribution.CompetitionDistance, cumulative=False, bins=30) axes[1].set_title(""Closest Competition histogram"") axes[1].set_xlim((min(df_store_check_distribution.CompetitionDistance), max(df_store_check_distribution.CompetitionDistance))) {""Mean"":np.nanmean(df_store.CompetitionDistance),""Median"":np.nanmedian(df_store.CompetitionDistance),""Standard Dev"":np.nanstd(df_store.CompetitionDistance)}#That's what i thought, very different values, let's see why '",No,2,33.0 "df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace = True)",No,5,17.0 "df_store.CompetitionOpenSinceMonth.fillna(0, inplace = True) df_store.CompetitionOpenSinceYear.fillna(0,inplace=True)",No,5,17.0 "df_store.Promo2SinceWeek.fillna(0,inplace=True) df_store.Promo2SinceYear.fillna(0,inplace=True) df_store.PromoInterval.fillna(0,inplace=True)",No,5,17.0 "#Left-join the train to the store dataset since .Why? 
#Because you want to make sure you have all events even if some of them don't have their store information ( which shouldn't happen) df_train_store = pd.merge(df_train, df_store, how = 'left', on = 'Store') df_train_store.head() print (""The Train_Store dataset has {} Rows and {} Variables"".format(str(df_train_store.shape[0]),str(df_train_store.shape[1]))) '",No,4,32.0 df_train_store['SalesperCustomer']=df_train_store['Sales']/df_train_store['Customers'],No,5,8.0 df_train_store.head(),No,5,41.0 "fig, axes = plt.subplots(2, 3,figsize=(17,10) ) palette = itertools.cycle(sns.color_palette(n_colors=4)) plt.subplots_adjust(hspace = 0.28) #axes[1].df_train_store.groupby(by=""StoreType"").count().Store.plot(kind='bar') axes[0,0].bar(df_store.groupby(by=""StoreType"").count().Store.index,df_store.groupby(by=""StoreType"").count().Store,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,0].set_title(""Number of Stores per Store Type \ Fig 1.1"") axes[0,1].bar(df_train_store.groupby(by=""StoreType"").sum().Sales.index,df_train_store.groupby(by=""StoreType"").sum().Sales/1e9,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,1].set_title(""Total Sales per Store Type (in Billions) \ Fig 1.2"") axes[0,2].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").sum().Customers/1e6,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,2].set_title(""Total Number of Customers per Store Type (in Millions) \ Fig 1.3"") axes[1,0].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").Sales.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,0].set_title(""Average Sales per Store Type \ Fig 1.4"") axes[1,1].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").Customers.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,1].set_title(""Average Number of Customers per Store Type \ Fig 1.5"") axes[1,2].bar(df_train_store.groupby(by=""StoreType"").sum().Sales.index,df_train_store.groupby(by=""StoreType"").SalesperCustomer.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,2].set_title(""Average Spending per Customer in each Store Type \ Fig 1.6"") plt.show()'",No,3,33.0 "StoretypeXAssortment = sns.countplot(x=""StoreType"",hue=""Assortment"",order=[""a"",""b"",""c"",""d""], data=df_store,palette=sns.color_palette(""Set2"", n_colors=3)).set_title(""Number of Different Assortments per Store Type"") df_store.groupby(by=[""StoreType"",""Assortment""]).Assortment.count() ",No,2,33.0 "df_train_store['Month']=df_train_store.Date.dt.month df_train_store['Year']=df_train_store.Date.dt.year",No,3,8.0 " sns.factorplot(data = df_train_store, x =""Month"", y = ""Sales"", col = 'Promo', # per store type in cols hue = 'Promo2', row = ""Year"" ,sharex=False) '",No,4,33.0 "sns.factorplot(data = df_train_store, x =""Month"", y = ""SalesperCustomer"", col = 'Promo', # per store type in cols hue = 'Promo2', row = ""Year"" ,sharex=False)'",No,2,33.0 "sns.factorplot(data = df_train_store, x =""DayOfWeek"", y = ""Sales"", hue='Promo' ,sharex=False)'",No,2,33.0 "#33 Stores are opened on Sundays print (""Number of Stores opened on Sundays:{}"" .format(df_train_store[(df_train_store.Open == 1) & (df_train_store.DayOfWeek == 7)]['Store'].unique().shape[0]))'",No,3,54.0 
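# A small hedged follow-up to the Sunday count above (a sketch, not a required step):
# for the stores that do open on Sundays, compare their average Sunday sales with
# their sales on other days. Only df_train_store and the columns already used above
# (Open, DayOfWeek, Store, Sales) are assumed; sunday_stores and subset are new names
# introduced just for this illustration.
sunday_stores = df_train_store[(df_train_store.Open == 1) &
                               (df_train_store.DayOfWeek == 7)]['Store'].unique()
subset = df_train_store[df_train_store.Store.isin(sunday_stores)]
print(subset.groupby(subset.DayOfWeek == 7)['Sales'].mean())  # False = Mon-Sat, True = Sunday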
"df_train_store['CompetitionDist_Cat']=pd.cut(df_train_store['CompetitionDistance'], 5)",No,4,13.0 "df_train_store.groupby(by=""CompetitionDist_Cat"").Sales.mean(),df_train_store.groupby(by=""CompetitionDist_Cat"").Customers.mean()",No,5,60.0 "del df_train_store[""CompetitionDist_Cat""]",No,5,10.0 df_train_store['Day']=df_train_store.Date.dt.day,No,5,8.0 "del df_train_store[""Date""]",No,5,10.0 "df_train_store['StoreType'].isnull().any(),df_train_store['Assortment'].isnull().any(),df_train_store['StateHoliday'].isnull().any() #No Null values we can proceed with the transformation",No,2,39.0 "df_train_store[""StoreType""].value_counts(),df_train_store[""Assortment""].value_counts(),df_train_store[""StateHoliday""].value_counts()",No,4,54.0 "df_train_store['StateHoliday'] = df_train_store['StateHoliday'].astype('category') df_train_store['Assortment'] = df_train_store['Assortment'].astype('category') df_train_store['StoreType'] = df_train_store['StoreType'].astype('category') df_train_store['PromoInterval']= df_train_store['PromoInterval'].astype('category')",No,5,16.0 "df_train_store['StateHoliday_cat'] = df_train_store['StateHoliday'].cat.codes df_train_store['Assortment_cat'] = df_train_store['Assortment'].cat.codes df_train_store['StoreType_cat'] = df_train_store['StoreType'].cat.codes df_train_store['PromoInterval_cat'] = df_train_store['PromoInterval'].cat.codes ",No,3,16.0 "df_train_store['StateHoliday_cat'] = df_train_store['StateHoliday_cat'].astype('float') df_train_store['Assortment_cat'] = df_train_store['Assortment_cat'].astype('float') df_train_store['StoreType_cat'] = df_train_store['StoreType_cat'].astype('float') df_train_store['PromoInterval_cat'] = df_train_store['PromoInterval_cat'].astype('float')",No,5,16.0 "from argparse import Namespace #There are 1115 stores. Select a small sample to do experimentation on. Select all for full training. num_sample_stores=1115 #The test set is 47 days. Normally use the last 47 days of the training data for validation. 
Se to 0 and use all data for traing when submitting to kaggle valid_days=0 #Hyperparameters s= Namespace( **{ ""l1"":4497, ""l2"":2328, ""ps1"":0.2771132028380148, ""ps2"":0.15631474446268287, ""emb_drop"":0.14301109844119272, ""batchsize"":64, ""lrate"":0.0660858230905056, ""lrate_ratio"":9, ""wd"":0.17305139150930285, ""l1epoch"":4, ""l2epoch"":3, ""l3epoch"":8, }) ",No,5,59.0 "from pathlib import Path from datetime import datetime, timedelta import numpy as np import pandas as pd from fastai import * from fastai.tabular import * #display results import plotly import plotly.plotly as py import plotly.graph_objs as go import cufflinks as cf ",No,5,22.0 "plotly.offline.init_notebook_mode(connected=False) cf.go_offline() %matplotlib inline %reload_ext autoreload %autoreload 2 pd.set_option('display.max_columns', 0) pd.set_option('display.max_rows', 500)",No,5,23.0 !ls ../input/,No,5,88.0 "path=Path(""../input/rossmann-data-engineering/"") traindf=pd.read_feather(path/""train.feather"") testdf=pd.read_feather(path/""test.feather"")",No,5,44.0 "best_type= data.sort_values('revenue', ascending=False) plt.figure(figsize=(13,12)) sns.barplot(x=best_type['Type'], y=best_type['revenue'])",No,4,9.0 data,No,5,41.0 "#Select size validation set based on valid_days variable from datetime import datetime, timedelta valid_idx=traindata[traindata.Date>=(traindata.Date.max()- timedelta(days=valid_days))].index.tolist()",No,4,8.0 "#Convert datetime columns to int64 for traning datecols=traindata.select_dtypes(include=""datetime"").columns.tolist() traindata[datecols]=traindata[datecols].astype(""int64"") testdf[datecols]=testdf[datecols].astype(""int64"")",No,5,16.0 "procs = [FillMissing, Categorify, Normalize] dep_var = 'Sales' #cont_names,cat_names= cont_cat_split(sample_train,dep_var=""Sales"") cont_names=[ 'CompetitionDistance', 'Week', 'Day', 'Dayofyear', 'Elapsed', 'ratio-sales-customer', 'ratio-saturday-week', 'ratio-sunday-week', 'ratio-promo-nopromo', 'Promo_thisweek', 'Open_thisweek', 'StateHolidayBool_thisweek', 'SchoolHoliday_thisweek', 'Promo_prevweek', 'Open_prevweek', 'StateHolidayBool_prevweek', 'SchoolHoliday_prevweek', 'Promo_nextweek', 'Open_nextweek', 'StateHolidayBool_nextweek', 'SchoolHoliday_nextweek', 'Promo2Days', 'CompetitionDaysOpen', 'trend', 'trend_DE', 'Max_Humidity', 'Max_Wind_SpeedKm_h', 'Mean_Humidity', 'Mean_TemperatureC', 'Max_TemperatureC_chnage', 'Month_Sales_mean', 'Year_Sales_mean', 'Dayofweek_Sales_mean', 'Dayofweek_promo_Sales_mean', 'BeforeSchoolHoliday', 'AfterSchoolHoliday', 'BeforeClosed', 'AfterClosed', 'BeforePromo', 'AfterPromo', 'BeforeStateHolidayBool', 'AfterStateHolidayBool', 'Promo2ActiveMonthBool', 'BeforePromo2ActiveMonthBool', 'AfterPromo2ActiveMonthBool', 'SchoolHoliday_fw', 'StateHolidayBool_fw', 'Promo_fw', 'Closed_fw', 'Promo2ActiveMonthBool_fw', 'CompetitionOpenSince', 'Promo2Since' ] cat_names=[ 'Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'Promo2', 'PromoInterval', 'Year', 'Month', 'Dayofweek', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Promo2SinceYear', 'Promo2Na', 'Events', 'Fog', 'Hail', 'Rain', 'Snow', 'Thunderstorm', 'Quarter', 'CompetitionOpenNA', 'CompetitionDistanceNA', 'CompetitionOpenSinceYear', 'State' ] '",No,3,21.0 "max_log_y = np.log(np.max(traindata['Sales']))#*1.2 y_range = torch.tensor([0, max_log_y], device=defaults.device)",No,4,21.0 "databunch = (TabularList.from_df(traindata, path="""", cat_names=cat_names, 
cont_names=cont_names, procs=procs,) .split_by_idx(valid_idx) .label_from_df(cols=dep_var, label_cls=FloatList, log=True) .add_test(TabularList.from_df(testdf, path=path, cat_names=cat_names, cont_names=cont_names)) .databunch()) databunch.batch_size=s.batchsize",No,4,13.0 "learn = tabular_learner(databunch, layers=[s.l1,s.l2], ps=[s.ps1,s.ps2], emb_drop=s.emb_drop, y_range=y_range, metrics=exp_rmspe)",No,5,4.0 learn.lr_find(),No,5,2.0 learn.recorder.plot(),No,5,35.0 "learn.fit_one_cycle(s.l1epoch, s.lrate, wd=s.wd)",No,5,7.0 "learn.fit_one_cycle(s.l2epoch, s.lrate/s.lrate_ratio, wd=s.wd)",No,5,7.0 "learn.fit_one_cycle(s.l3epoch, s.lrate/(s.lrate_ratio*s.lrate_ratio), wd=s.wd)",No,5,7.0 "valid_preds=learn.get_preds(DatasetType.Valid) traindata[""SalesPreds""]=pd.Series(index=traindata.iloc[valid_idx].index,data=np.exp(valid_preds[0].numpy().T[0]))",No,4,48.0 "#Define error function def rmspe_metric(act,pred): return np.sqrt(np.mean(((act-pred)/act)**2))",No,5,84.0 "rmspe_metric(traindata.Sales,traindata.SalesPreds)",No,5,28.0 "#Sort stores by how much error store_rmspe=traindata.groupby([""Store""]).apply(lambda x:rmspe_metric(x.Sales,x.SalesPreds)).sort_values(ascending=False)",No,5,28.0 "store_rmspe.iplot(kind=""histogram"")",No,5,33.0 store_rmspe[:10],No,5,41.0 "t=traindata.set_index(""Date"")",No,5,61.0 "#Stores with most error for store in store_rmspe.index[:4].tolist(): t[t.Store==store][[""Sales"",""SalesPreds""]].iplot(kind=""bar"",barmode=""overlay"",title=""Store {}"".format(store))",No,5,33.0 "#Stores with least error for store in store_rmspe.index[-4:].tolist(): t[t.Store==store][[""Sales"",""SalesPreds""]].iplot(kind=""bar"",barmode=""overlay"",title=""Store {}"".format(store))",No,5,33.0 "test_preds=learn.get_preds(DatasetType.Test) testdf[""Sales""]=np.exp(test_preds[0].data).numpy().T[0] testdf[[""Id"",""Sales""]]=testdf[[""Id"",""Sales""]].astype(""int"") testdf[[""Id"",""Sales""]].to_csv(""rossmann_submission.csv"",index=False)",No,3,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from xgboost import XGBRegressor",No,5,22.0 "train_data = pd.read_csv(""../input/train.csv"",low_memory= False) test_data = pd.read_csv(""../input/test.csv"",low_memory= False) store_data = pd.read_csv(""../input/store.csv"",low_memory= False) test_copy = test_data",No,5,45.0 "print(""Shape of Train data :"", train_data.shape) print(""Shape of Test data :"", test_data.shape) print(""Shape of Store data :"", store_data.shape)",No,5,58.0 train_data.head(),No,5,41.0 store_data.head(100),No,5,41.0 train_data.isnull().sum(),No,5,39.0 test_data.isnull().sum(),No,5,39.0 store_data.isnull().sum().sort_values(ascending = False),No,5,39.0 store_data['Promo2SinceWeek'].unique(),No,5,57.0 train_data['Store'].unique(),No,5,57.0 train_data['DayOfWeek'].unique(),No,5,57.0 train_data['Open'].unique(),No,5,57.0 train_data['StateHoliday'].unique(),No,5,57.0 train_data['Promo'].unique(),No,5,57.0 store_data['CompetitionOpenSinceMonth'].unique(),No,5,57.0 "print(sum(train_data[""Open""] == 0)) print(sum(train_data[""Open""] == 1))",No,5,72.0 "print(sum(test_data[""Open""] == 0)) print(sum(test_data[""Open""] == 1))",No,5,72.0 "print(sum(train_data[""StateHoliday""] == 'a')) print(sum(train_data[""StateHoliday""] == 'b')) print(sum(train_data[""StateHoliday""] == 'c')) print(sum(train_data[""StateHoliday""] == 0))'",No,5,72.0 "plt.plot(train_data['DayOfWeek'],train_data['Customers'])",No,5,81.0 "train_data[['Sales','Customers','Promo','SchoolHoliday']].corr(method='pearson')",No,5,40.0 
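# A minimal cross-check sketch for the date features built in the next cell, assuming
# train_data['Date'] holds 'YYYY-MM-DD' strings as loaded above: the month and year taken
# there by string slicing can equivalently be read off the pandas datetime accessors.
# `dates` is a throwaway name introduced only for this sketch.
dates = pd.to_datetime(train_data['Date'])
assert (dates.dt.month == train_data['Date'].str[5:7].astype(int)).all()
assert (dates.dt.year == train_data['Date'].str[:4].astype(int)).all()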
"train_data['Mon'] = train_data[""Date""].apply(lambda x : int(x[5:7])) train_data['Yr'] = train_data[""Date""].apply(lambda x : int(x[:4])) train_data[""HolidayBin""] = train_data.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,16.0 "test_data['Mon'] = test_data[""Date""].apply(lambda x : int(x[5:7])) test_data['Yr'] = test_data[""Date""].apply(lambda x : int(x[:4])) test_data[""HolidayBin""] = test_data.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,5,16.0 "train_data = train_data.merge(store_data) test_data =test_data.merge(store_data)",No,5,32.0 train_data.isnull().sum().sort_values(ascending= False),No,5,39.0 test_data.isnull().sum().sort_values(ascending= False),No,5,39.0 test_data[test_data['Open'].isnull()],No,5,14.0 "for i in train_data['Promo2SinceWeek'].unique() : print(i ,':', sum(train_data['Promo2SinceWeek'] == i )) ",No,5,72.0 "for i in train_data['CompetitionOpenSinceMonth'].unique() : print(i ,':', sum(train_data['CompetitionOpenSinceMonth'] == i ))",No,5,72.0 "for i in train_data['Promo2SinceYear'].unique() : print(i ,':', sum(train_data['Promo2SinceYear'] == i ))",No,5,72.0 "for i in train_data['CompetitionOpenSinceYear'].unique() : print(i ,':', sum(train_data['CompetitionOpenSinceYear'] == i ))",No,5,72.0 "train_data = train_data.drop(['Customers', 'Store','Date','StateHoliday'],axis= 1 ) test_data = test_data.drop(['Date','StateHoliday','Store','Id'],axis= 1 )",No,5,10.0 sum(train_data['Open'] == 0),No,5,72.0 train_data = train_data.drop(train_data[train_data['Open'] == 0].index.tolist()),No,5,10.0 train_data[train_data['HolidayBin'].isnull()],No,5,14.0 "train_data['CompetitionOpenSinceMonth'] = train_data['CompetitionOpenSinceMonth'].fillna(9.0) train_data['HolidayBin'] = train_data['HolidayBin'].fillna(0) train_data['Promo2SinceWeek'] = train_data['Promo2SinceWeek'].fillna(40.0) train_data['Promo2SinceYear'] = train_data['Promo2SinceYear'].fillna(2012.0) train_data['CompetitionOpenSinceYear'] = train_data['CompetitionOpenSinceYear'].fillna(2012.0) train_data['CompetitionDistance'] = train_data['CompetitionDistance'].fillna(train_data['CompetitionDistance'].mean()) train_data.isnull().sum().sort_values(ascending = False)",No,5,17.0 "test_data['Open'] = test_data['Open'].fillna(1) test_data['CompetitionOpenSinceMonth'] = test_data['CompetitionOpenSinceMonth'].fillna(9.0) test_data['CompetitionDistance'] = test_data['CompetitionDistance'].fillna(train_data['CompetitionDistance'].mean()) test_data['CompetitionOpenSinceYear'] = test_data['CompetitionOpenSinceYear'].fillna(2012.0) test_data['Promo2SinceWeek'] = test_data['Promo2SinceWeek'].fillna(40.0) test_data['Promo2SinceYear'] = test_data['Promo2SinceYear'].fillna(2012.0) test_data.isnull().sum().sort_values(ascending = False)",No,4,17.0 sum(train_data['Sales'] < 0 ),No,5,72.0 train_data.head(100),No,5,41.0 "categorical_train = train_data.columns.tolist() print(categorical_train) train_data[categorical_train].corr(method='pearson')",No,3,80.0 "train_features = train_data.drop(['Open'],axis = 1) categorical_train = train_features.columns.tolist() print(categorical_train) train_data[categorical_train].corr(method='pearson') train_features = train_data.drop(['Sales'],axis = 1) full_features = pd.concat([train_features,test_data],ignore_index= True) print(train_features.shape) print(test_data.shape)",No,2,80.0 full_features.head(),No,5,41.0 full_features.shape,No,5,58.0 "full_features = pd.get_dummies(full_features,columns= ['HolidayBin','Assortment','StoreType'])",No,5,20.0 
"full_features = full_features.drop('PromoInterval',axis = 1)",No,5,10.0 "train_features = full_features.iloc[:844392,:].values test_data = full_features.iloc[844392:,:].values train_sales = train_data['Sales'].values",No,5,13.0 "print(train_features.shape) print(train_sales.shape) print(test_data.shape)",No,5,58.0 "xgboost = XGBRegressor(learning_rate=0.009, n_estimators=500, max_depth=10, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.7, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=27, reg_alpha=0.00006, random_state=42)",No,5,4.0 "xgboost.fit(train_features,train_sales) ",No,5,7.0 predictions = xgboost.predict(test_data),No,5,48.0 "pred_df = pd.DataFrame({""Id"": test_copy[""Id""], 'Sales': predictions}) pred_df.to_csv(""xgboost_4_submission.csv"", index=False)'",No,4,25.0 "import os import string import numpy as np import pandas as pd from pandasql import sqldf import matplotlib.pyplot as plt from keras.utils.np_utils import to_categorical from keras.models import Model, Sequential, model_from_json from keras.optimizers import SGD, Adam, RMSprop from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding from keras.initializers import RandomNormal, Constant from keras.callbacks import ModelCheckpoint, EarlyStopping from keras import regularizers from keras import backend as K import tensorflow as tf from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error import seaborn as sns import warnings from math import sqrt import itertools from tqdm import tqdm np.random.seed(42) # for reproducibility sns.set(style=""whitegrid"", color_codes=True) sns.set(font_scale=1) pd.set_option('display.max_columns', 60) %matplotlib inline warnings.filterwarnings('ignore')'",No,5,23.0 "b""def concat_data():\n df_train = pd.read_csv('../input/train.csv')\n df_test = pd.read_csv('../input/test.csv')\n df_extra = pd.read_csv('../input/store.csv')\n df_test['Sales'] = -1\n df_full = pd.concat([df_train, df_test]).reset_index(drop=True)\n\n #Merge extra information about stores\n df_full = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left')\n \n df_full['Year'] = pd.DatetimeIndex(df_full['Date']).year\n df_full['Month'] = pd.DatetimeIndex(df_full['Date']).month\n df_full['Day'] = pd.DatetimeIndex(df_full['Date']).day\n df_full['WeekOfYear'] = pd.DatetimeIndex(df_full['Date']).weekofyear\n \n # Calculate competition open in months\n df_full['CompetitionOpen'] = 12 * (df_full.Year - df_full.CompetitionOpenSinceYear) + \\\n (df_full.Month - df_full.CompetitionOpenSinceMonth)\n\n # Calculate promo open time in months\n df_full['PromoOpen'] = 12 * (df_full.Year - df_full.Promo2SinceYear) + \\\n (df_full.WeekOfYear - df_full.Promo2SinceWeek) / 4.0\n df_full['PromoOpen'] = df_full.PromoOpen.apply(lambda x: x if x > 0 else 0)\n df_full.loc[df_full.Promo2SinceYear == 0, 'PromoOpen'] = 0\n\n # Transform month interval in a boolean column \n month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',\n 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\n df_full['monthStr'] = df_full.Month.map(month2str)\n df_full.loc[df_full.PromoInterval == 0, 'PromoInterval'] = ''\n df_full['IsPromoMonth'] = 0\n for interval in df_full.PromoInterval.unique():\n interval = str(interval)\n if interval != '':\n for month in interval.split(','):\n df_full.loc[(df_full.monthStr == month) & (df_full.PromoInterval == interval), 
'IsPromoMonth'] = 1\n\n\n return df_full\n\ndf_full = concat_data()""",Yes,2,45.0 "def extrat_test_data(df_full): df_train = df_full.loc[df_full['Sales'] != -1] df_test = df_full.loc[df_full['Sales'] == -1] return df_train, df_test df_train, df_test = extrat_test_data(df_full)",No,4,13.0 df_full.head(),No,5,41.0 "# Function to calculate missing values by column (By DSA) def missing_values_table(df): # Total missing values mis_val = df.isnull().sum() # Percentage of missing values mis_val_percent = 100 * df.isnull().sum() / len(df) # Make a table with the results mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1) # Rename the columns mis_val_table_ren_columns = mis_val_table.rename( columns = {0 : 'Missing Values', 1 : '% of Total Values'}) # Sort the table by percentage of missing descending mis_val_table_ren_columns = mis_val_table_ren_columns[ mis_val_table_ren_columns.iloc[:,1] != 0].sort_values( '% of Total Values', ascending=False).round(1) # Print some summary information print (""Your selected dataframe has "" + str(df.shape[1]) + "" columns.\ "" ""There are "" + str(mis_val_table_ren_columns.shape[0]) + "" columns that have missing values."") # Return the dataframe with missing information return mis_val_table_ren_columns missing_values_table(df_full)'",No,4,39.0 df_train_store.dtypes,No,5,70.0 df_full.groupby('StoreType')['Sales'].describe(),No,5,40.0 "df_full.groupby('StoreType')['Customers', 'Sales'].sum()",No,5,60.0 "# Plotting correlations num_feat=df_full.columns[df_full.dtypes!=object] num_feat=num_feat[1:-1] labels = [] values = [] for col in num_feat: labels.append(col) values.append(np.corrcoef(df_full[col].values, df_full['Sales'].values)[0,1]) ind = np.arange(len(labels)) width = 0.9 fig, ax = plt.subplots(figsize=(10,15)) rects = ax.barh(ind, np.array(values), color='red') ax.set_yticks(ind+((width)/2.)) ax.set_yticklabels(labels, rotation='horizontal') ax.set_xlabel(""Correlation coefficient"") ax.set_title(""Correlation Coefficients w.r.t Sales"")'",No,3,80.0 "# Heatmap of correlations features corrMatrix=df_full[[""Sales"", ""DayOfWeek"", ""Open"", ""Promo"", ""SchoolHoliday"", ""CompetitionDistance"", ""CompetitionOpenSinceMonth"", ""CompetitionOpenSinceYear"", ""Promo2"", ""Promo2SinceWeek"", ""Promo2SinceYear"", ""Year"", ""Month"", ""Day"", ""CompetitionOpen"", ""PromoOpen"", ""IsPromoMonth"", ""Store""]].corr() sns.set(font_scale=1.10) plt.figure(figsize=(30, 30)) sns.heatmap(corrMatrix, vmax=.8, linewidths=0.01, square=True,annot=True,cmap='viridis',linecolor=""white"") plt.title('Correlation between features')'",No,4,80.0 "def clean_data(use_text_columns = True): ''' Function that clean data and create a new features to enrich the model ''' cols_num = [""Sales"", ""DayOfWeek"", ""Open"", ""Promo"", ""SchoolHoliday"", ""CompetitionDistance"", ""CompetitionOpenSinceMonth"", ""CompetitionOpenSinceYear"", ""Promo2"", ""Promo2SinceWeek"", ""Promo2SinceYear"", ""Wapp"", ""Avg_Customers"", ""Year"", ""Month"", ""Day"", ""CompetitionOpen"", ""PromoOpen"", ""IsPromoMonth"", ""Store""] cols_text = [""StateHoliday"", ""StoreType"", ""Assortment""] df_train = pd.read_csv('../input/train.csv') len_train_data = len(df_train) df_test = pd.read_csv('../input/test.csv') # Setting null values of column Open in test dataset df_test.loc[df_test['DayOfWeek'] != 7, 'Open'] = 1 df_test.loc[df_test['DayOfWeek'] == 7, 'Open'] = 0 avg_customer = sqldf( """""" SELECT Store, DayOfWeek, sum(case when Customers is not null then Sales/Customers else 0 end) as Wapp, 
round(avg(Customers)) Avg_Customers from df_train group by Store,DayOfWeek """""" ) df_test = sqldf( """""" SELECT t.*, ac.Wapp, ac.Avg_Customers from df_test t left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek """""" ) df_train = sqldf( """""" SELECT t.*, ac.Wapp, ac.Avg_Customers from df_train t left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek """""" ) # Merge train and test dataset all_data = pd.concat([df_train, df_test], ignore_index=True) df_extra = pd.read_csv('../input/store.csv') df_full = pd.concat([df_train, df_test]).reset_index(drop=True) # Merge extra information about stores all_data = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left') # Separate date in Year, Month and Day all_data.loc[all_data['StateHoliday'] == 0, 'StateHoliday'] = 'd' all_data['Year'] = pd.DatetimeIndex(all_data['Date']).year all_data['Month'] = pd.DatetimeIndex(all_data['Date']).month all_data['Day'] = pd.DatetimeIndex(all_data['Date']).day all_data['WeekOfYear'] = pd.DatetimeIndex(all_data['Date']).weekofyear # Calculate competition open in months all_data['CompetitionOpen'] = 12 * (all_data.Year - all_data.CompetitionOpenSinceYear) + \\ (all_data.Month - all_data.CompetitionOpenSinceMonth) # Calculate promo open time in months all_data['PromoOpen'] = 12 * (all_data.Year - all_data.Promo2SinceYear) + \\ (all_data.WeekOfYear - all_data.Promo2SinceWeek) / 4.0 all_data['PromoOpen'] = all_data.PromoOpen.apply(lambda x: x if x > 0 else 0) all_data.loc[all_data.Promo2SinceYear == 0, 'PromoOpen'] = 0 # Transform month interval in a boolean column month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} all_data['monthStr'] = all_data.Month.map(month2str) all_data.loc[all_data.PromoInterval == 0, 'PromoInterval'] = '' all_data['IsPromoMonth'] = 0 for interval in all_data.PromoInterval.unique(): interval = str(interval) if interval != '': for month in interval.split(','): all_data.loc[(all_data.monthStr == month) & (all_data.PromoInterval == interval), 'IsPromoMonth'] = 1 data_numeric = all_data[cols_num] # Fill NAN values # Only column CompetitionDistance is fill NaN with a median value data_numeric['CompetitionDistance'].fillna(data_numeric['CompetitionDistance'].median(), inplace = True) # Other values is fill with zero data_numeric.fillna(0, inplace = True) if (use_text_columns): data_text = all_data[cols_text] data_text = pd.get_dummies(data_text, dummy_na=False) complete_data = pd.concat([data_numeric, data_text], axis = 1) df_train = complete_data.iloc[:len_train_data,:] df_test = complete_data.iloc[len_train_data:,:] else: df_train = data_numeric.iloc[:len_train_data,:] df_test = data_numeric.iloc[len_train_data:,:] return df_train, df_test'",Yes,3,43.0 "def load_train_data(scaler_x, scaler_y): ''' Transform train data set and separate a test dataset to validate the model in the end of training and normalize data ''' X_train = train.drop([""Sales""], axis=1) # Features y_train = np.array(train[""Sales""]).reshape((len(X_train), 1)) # Targets X_train = scaler_x.fit_transform(X_train) y_train = scaler_y.fit_transform(y_train) X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42) return (X_train, y_train), (X_test, y_test)'",No,3,13.0 "def load_test_data(): ''' Remove column of predictions and normalize data of submission test data set. 
''' X_test = test.drop([""Sales""], axis=1) # Features X_test = StandardScaler().fit_transform(X_test) return X_test'",No,4,20.0 "b""# Show info of model\ndef show_info(model, X, y, log, weights = None):\n '''\n Show metrics about the evaluation model and plots about loss, rmse and rmspe\n '''\n if (log != None):\n # summarize history for loss\n plt.figure(figsize=(14,10))\n plt.plot(log.history['loss'])\n plt.plot(log.history['val_loss'])\n plt.title('Model Loss')\n plt.ylabel('loss')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n print('\\n')\n \n # summarize history for rmse\n plt.figure(figsize=(14,10))\n plt.plot(log.history['rmse'])\n plt.plot(log.history['val_rmse'])\n plt.title('Model RMSE')\n plt.ylabel('rmse')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n print('\\n')\n \n # summarize history for rmspe\n plt.figure(figsize=(14,10))\n plt.plot(log.history['rmspe'])\n plt.plot(log.history['val_rmspe'])\n plt.title('Model RMSPE')\n plt.ylabel('rmspe')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n\n if (weights != None):\n model.load_weights(weights)\n\n predictions = model.predict(X, verbose=1)\n\n mse = mean_squared_error(y, predictions)\n rmse = sqrt(mse)\n rmspe = rmspe_val(y, predictions)\n\n print('MSE: %.3f' % mse)\n print('RMSE: %.3f' % rmse)\n print('RMSPE: %.3f' % rmspe)""",Yes,3,28.0 "def rmspe_val(y_true, y_pred): ''' RMSPE calculus to validate evaluation metric about the model ''' return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true), axis=0))[0]",No,5,84.0 "def rmspe(y_true, y_pred): ''' RMSPE calculus to use during training phase ''' return K.sqrt(K.mean(K.square((y_true - y_pred) / y_true), axis=-1))",No,5,84.0 "def rmse(y_true, y_pred): ''' RMSE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(y_pred - y_true)))",No,5,84.0 "def create_model(): ''' Create a neural network ''' initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None) model = Sequential() model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(1, activation=""linear"", kernel_initializer=initializer)) adam = Adam(lr=1e-3, decay=1e-3) # Compile model model.compile(loss=""mean_squared_error"", optimizer=adam, metrics=[rmse, rmspe]) return model'",No,3,4.0 "train, test = clean_data(use_text_columns = True)",No,3,13.0 "# Hyperparameters and load data to train the model batch_size = 512 nb_epoch = 300 scaler_x = StandardScaler() scaler_y = StandardScaler() print('Loading data...') (X_train, y_train), (X_test, y_test) = load_train_data(scaler_x, scaler_y) print('Build model...') model = create_model() model.summary()",Yes,3,4.0 "print('Fit model...') filepath=""weights_rossmann.best.hdf5"" checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min') early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min') callbacks_list = [checkpoint, early_stopping] log = model.fit(X_train, y_train, validation_split=0.20, batch_size=batch_size, epochs=nb_epoch, shuffle=True, callbacks=callbacks_list)'",No,4,7.0 "show_info(model, X_test, y_test, log, 
weights='weights_rossmann.best.hdf5')",No,4,35.0 "test_data = load_test_data() df_teste = pd.read_csv('../input/test.csv')",No,5,45.0 "predict = model.predict(test_data) predict = scaler_y.inverse_transform(predict)",No,4,48.0 "submission = pd.DataFrame() submission['Id'] = df_teste[""Id""] submission['Sales'] = predict submission.to_csv('submission.csv', index=False)'",No,4,25.0 "df_correlation=df_train_store[['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SalesperCustomer', 'Month', 'Year', 'Day', 'StateHoliday_cat', 'Assortment_cat', 'StoreType_cat', 'PromoInterval_cat']]",No,5,77.0 "df_correlation=df_correlation.drop('Open', axis = 1)",No,5,10.0 "upper_triangle = np.zeros_like(df_correlation.corr(), dtype = np.bool) upper_triangle[np.triu_indices_from(upper_triangle)] = True #make sure we don't show half of the other triangle f, ax = plt.subplots(figsize = (15, 10)) sns.heatmap(df_correlation.corr(),ax=ax,mask=upper_triangle,annot=True, fmt='.2f',linewidths=0.5,cmap=sns.diverging_palette(10, 133, as_cmap=True))",No,3,80.0 df_train_store.columns,No,5,71.0 "df_train_store['CompetitionOpenSince'] = np.where((df_train_store['CompetitionOpenSinceMonth']==0) & (df_train_store['CompetitionOpenSinceYear']==0) , 0,(df_train_store.Month - df_train_store.CompetitionOpenSinceMonth) + (12 * (df_train_store.Year - df_train_store.CompetitionOpenSinceYear)) )",No,4,8.0 "#now that CompetitionOpenSince is created #we can get rid of `CompetitionOpenSinceYear` and `CompeitionOpenSinceMonth` del df_train_store['CompetitionOpenSinceYear'] del df_train_store['CompetitionOpenSinceMonth'] ",No,5,10.0 "df_train_store[""is_holiday_state""] = df_train_store['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,8.0 del df_train_store['StateHoliday_cat'],No,5,10.0 "df_train_store=pd.get_dummies(df_train_store, columns=[""Assortment"", ""StoreType"",""PromoInterval""], prefix=[""is_Assortment"", ""is_StoreType"",""is_PromoInteval""])",No,5,20.0 "del df_train_store['Assortment_cat'] del df_train_store['StoreType_cat'] ",No,5,10.0 del df_train_store['PromoInterval_cat'],No,5,10.0 "df_test = pd.read_csv(""../input/test.csv"",sep=',', parse_dates=['Date'] , date_parser=str_to_date, low_memory = False) print (""The Test dataset has {} Rows and {} Variables"".format(str(df_test.shape[0]),str(df_test.shape[1])))'",No,3,45.0 "df_test.fillna(1, inplace = True) #11rows with Nans decided to leave them open since its one store 622 which is #usually open #Left-join the train to the store dataset since .Why? 
#Because you want to make sure you have all events even if some of them don't have their store information ( which shouldn't happen) df_test_store = pd.merge(df_test, df_store, how = 'left', on = 'Store') print (""The Test_Store dataset has {} Rows and {} Variables"".format(str(df_test_store.shape[0]),str(df_test_store.shape[1]))) df_test_store['Month']=df_test_store.Date.dt.month df_test_store['Year']=df_test_store.Date.dt.year df_test_store['Day']=df_test_store.Date.dt.day df_test_store['StateHoliday'] = df_test_store['StateHoliday'].astype('category') df_test_store['Assortment'] = df_test_store['Assortment'].astype('category') df_test_store['StoreType'] = df_test_store['StoreType'].astype('category') df_test_store['PromoInterval']= df_test_store['PromoInterval'].astype('category') df_test_store['StateHoliday_cat'] = df_test_store['StateHoliday'].cat.codes df_test_store['Assortment_cat'] = df_test_store['Assortment'].cat.codes df_test_store['StoreType_cat'] = df_test_store['StoreType'].cat.codes df_test_store['PromoInterval_cat'] = df_test_store['PromoInterval'].cat.codes df_test_store['StateHoliday_cat'] = df_test_store['StateHoliday_cat'].astype('float') df_test_store['Assortment_cat'] = df_test_store['Assortment_cat'].astype('float') df_test_store['StoreType_cat'] = df_test_store['StoreType_cat'].astype('float') df_test_store['PromoInterval_cat'] = df_test_store['PromoInterval_cat'].astype('float') df_test_store['CompetitionOpenSince'] = np.where((df_test_store['CompetitionOpenSinceMonth']==0) & (df_test_store['CompetitionOpenSinceYear']==0) , 0,(df_test_store.Month - df_test_store.CompetitionOpenSinceMonth) + (12 * (df_test_store.Year - df_test_store.CompetitionOpenSinceYear)) ) df_test_store[""is_holiday_state""] = df_test_store['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) df_test_store=pd.get_dummies(df_test_store, columns=[""Assortment"", ""StoreType"",""PromoInterval""], prefix=[""is_Assortment"", ""is_StoreType"",""is_PromoInteval""]) '",Yes,3,16.0 "del df_test_store[""Date""] del df_test_store['CompetitionOpenSinceYear'] del df_test_store['CompetitionOpenSinceMonth'] '",No,5,10.0 del df_test_store['StateHoliday_cat'],No,5,10.0 "del df_test_store['Assortment_cat'] del df_test_store['StoreType_cat'] del df_test_store['PromoInterval_cat']",No,5,10.0 del df_test_store['StateHoliday'],No,5,10.0 del df_train_store['StateHoliday'],No,5,10.0 "def rmspe(y, yhat): rmspe = np.sqrt(np.mean( (y - yhat)**2 )) return rmspe",No,4,49.0 "features = df_train_store.drop(['Customers', 'Sales', 'SalesperCustomer'], axis = 1) #a rule of thumb is to transform my target value to log if i see the values are very dispersed which is the case #and then of course revert them with np.exp to their real values targets=np.log(df_train_store.Sales) ",Yes,3,10.0 "X_train, X_train_test, y_train, y_train_test = model_selection.train_test_split(features, targets, test_size=0.20, random_state=15) print (""Training and testing split was successful."") ",No,5,13.0 "rfr = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=4, random_state=31, verbose=0, warm_start=False) rfr.fit(X_train, y_train) ",No,3,7.0 " ''' params = {'max_depth':(4,6,8,10,12,14,16,20), 'n_estimators':(4,8,16,24,48,72,96,128), 'min_samples_split':(2,4,6,8,10)} #scoring_fnc = metrics.make_scorer(rmspe) #the 
dimensionality is high, the number of combinations we have to search is enormous, using RandomizedSearchCV # is a better option then GridSearchCV grid = model_selection.RandomizedSearchCV(estimator=rfr,param_distributions=params,cv=10) #choosing 10 K-Folds makes sure i went through all of the data and didn't miss any pattern.(takes time to run but is worth doing it) grid.fit(X_train, y_train) ''' #I AM NOT GOING TO RUN THIS CHUNK TO BE ABLE TO COMMIT AND RUN MY KERNEL ON KAGGLE",No,3,6.0 "#with the optimal parameters i got let's see how it behaves with the validation set rfr_val=RandomForestRegressor(n_estimators=128, criterion='mse', max_depth=20, min_samples_split=10, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=4, #setting n_jobs to 4 makes sure you're using the full potential of the machine you're running the training on random_state=35, verbose=0, warm_start=False) model_RF_test=rfr_val.fit(X_train,y_train)",No,3,7.0 yhat=model_RF_test.predict(X_train_test),No,5,53.0 plt.hist(yhat),No,5,33.0 "error=rmspe(y_train_test,yhat) error",No,5,49.0 "importances = rfr_val.feature_importances_ std = np.std([rfr_val.feature_importances_ for tree in rfr_val.estimators_], axis=0) indices = np.argsort(importances) palette1 = itertools.cycle(sns.color_palette()) # Store the feature ranking features_ranked=[] for f in range(X_train.shape[1]): features_ranked.append(X_train.columns[indices[f]]) # Plot the feature importances of the forest plt.figure(figsize=(10,15)) plt.title(""Feature importances"") plt.barh(range(X_train.shape[1]), importances[indices], color=[next(palette1)], align=""center"") plt.yticks(range(X_train.shape[1]), features_ranked) plt.ylabel('Features') plt.ylim([-1, X_train.shape[1]]) plt.show() '",Yes,2,33.0 "df_test_store1=df_test_store.drop(['Id'],axis=1) kaggle_yhat= model_RF_test.predict(df_test_store1) kaggle_preds= pd.DataFrame({'Id': df_test_store['Id'], 'Sales': np.exp(kaggle_yhat)}) kaggle_preds.to_csv(""Stefano_Zakher_RF_Rossman_Kaggle_submission.csv"", index = False) '",Yes,3,55.0 "import os import pandas as pd import numpy as np import scipy import warnings warnings.filterwarnings(action='ignore') # Plotting Library import seaborn as sns import matplotlib.pyplot as plt plt.style.use('Solarize_Light2') # Other Libraries from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from math import sqrt from scipy.stats import ttest_ind ,linregress , ttest_rel import statsmodels.api as sm from scipy.stats import probplot from scipy.stats import zscore from sklearn.metrics import r2_score from statsmodels.graphics.regressionplots import influence_plot from sklearn.preprocessing import PolynomialFeatures , StandardScaler from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from sklearn.model_selection import GridSearchCV from sklearn.linear_model import LogisticRegression from sklearn.linear_model import Perceptron from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso ,ElasticNet from sklearn.ensemble import RandomForestRegressor from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.model_selection import train_test_split print(os.listdir(""../input""))'",No,4,88.0 "def ispromomonth(rows): # if not rows[0].isnull(): months = {} months = str(rows['PromoInterval']).split(',') if str(rows['month_str']) in months: return 1 else: 
return 0 def rmspe(y, yhat): return np.sqrt(np.mean((yhat/y-1) ** 2)) def rmspe_xg(yhat, y): y = np.expm1(y.get_label()) yhat = np.expm1(yhat) return ""rmspe"", rmspe(y,yhat) class Rossmann_: def __init__(self , train_data_path = '../input/train.csv' , test_data_path='../input/test.csv' , store_path='../input/store.csv' , nrows =100000): self.train_data_path = train_data_path self.test_data_path = test_data_path self.store_path = store_path self.read_size = nrows self.train_data_original = pd.read_csv(self.train_data_path , low_memory = False , nrows = self.read_size) self.test_data_original = pd.read_csv(self.test_data_path ,low_memory = False , nrows = self.read_size) self.store_data_original = pd.read_csv(self.store_path) self.start_preprocessing_train(self.train_data_original , self.store_data_original) self.start_preprocessing_test(self.test_data_original , self.store_data_original) def start_preprocessing_train(self , train_data , store): train_data.StateHoliday = train_data.StateHoliday.replace('0',0) train_data.StateHoliday = train_data.StateHoliday.replace('a',1) train_data.StateHoliday = train_data.StateHoliday.replace('b',2) train_data.StateHoliday = train_data.StateHoliday.replace('c',3) train_data['Date_Year'] = train_data['Date'].apply(lambda x: int(x[:4])) train_data['Date_Month'] = train_data['Date'].apply(lambda x: int(x[5:7])) train_data['Date_Day'] = train_data['Date'].apply(lambda x: int(x[8:])) train_data_m = pd.merge(train_data, store, on='Store') mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} train_data_m.StoreType.replace(mappings, inplace=True) train_data_m.Assortment.replace(mappings, inplace=True) #Finding the week of the year train_data_m['Date'] = pd.to_datetime(train_data_m['Date'], errors='coerce') train_data_m['date_WeekOfYear'] = train_data_m.Date.dt.weekofyear #Combining the Week and Year for Competition and Promo train_data_m['Competition_Weeks'] = 12*(train_data_m.Date_Year - train_data_m.CompetitionOpenSinceYear ) + (train_data_m.Date_Month - train_data_m.CompetitionOpenSinceMonth) train_data_m['Promo_Weeks'] = 12*(train_data_m.Date_Year - train_data_m.Promo2SinceYear ) + (train_data_m.Date_Month - train_data_m.Promo2SinceWeek) train_data_m['Competition_Weeks'] = train_data_m['Competition_Weeks'].apply(lambda x: x if x > 0 else 0) train_data_m['Promo_Weeks'] = train_data_m['Promo_Weeks'].apply(lambda x: x if x > 0 else 0) # is promo month is the months the promo is valid so month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} train_data_m['month_str'] = train_data_m.Date_Month.map(month2str) train_data_m['IsPromoMonth'] = train_data_m[[ 'PromoInterval' , 'month_str' ]].apply(ispromomonth , axis = 1) train_data_m.fillna(0, inplace=True) #updating the rows with sales>0 and customes>0 train_data_updated = train_data_m[train_data_m['Sales']>0] train_data_updated = train_data_updated[train_data_updated['Customers']>0] features = ['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo','StateHoliday', 'SchoolHoliday', 'Date_Year', 'Date_Month', 'Date_Day','StoreType', 'Assortment', 'CompetitionDistance','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2','Promo2SinceWeek', 'Promo2SinceYear', 'date_WeekOfYear', 'Competition_Weeks', 'Promo_Weeks', 'IsPromoMonth'] self.train_final = train_data_updated[features] cols = self.train_final.columns self.train_final = pd.DataFrame(StandardScaler().fit_transform(self.train_final) , columns = cols) def start_preprocessing_test(self 
, test_data , store): test_data.fillna(1 , inplace=True) # These are all the Oprations appied on the Data test_data['Date_Year'] = test_data['Date'].apply(lambda x: int(x[:4])) test_data['Date_Month'] = test_data['Date'].apply(lambda x: int(x[5:7])) test_data['Date_Day'] = test_data['Date'].apply(lambda x: int(x[8:])) test_data_m = pd.merge(test_data, store, on='Store') mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} test_data_m.StoreType.replace(mappings, inplace=True) test_data_m.Assortment.replace(mappings, inplace=True) test_data_m.StateHoliday.replace(mappings, inplace=True) test_data_m['Date'] = pd.to_datetime(test_data_m['Date'], errors='coerce') test_data_m['date_WeekOfYear'] = test_data_m.Date.dt.weekofyear test_data_m['Competition_Weeks'] = 12*(test_data_m.Date_Year - test_data_m.CompetitionOpenSinceYear ) + (test_data_m.Date_Month - test_data_m.CompetitionOpenSinceMonth) test_data_m['Promo_Weeks'] = 12*(test_data_m.Date_Year - test_data_m.Promo2SinceYear ) + (test_data_m.Date_Month - test_data_m.Promo2SinceWeek) test_data_m['Competition_Weeks'] = test_data_m['Competition_Weeks'].apply(lambda x: x if x > 0 else 0) test_data_m['Promo_Weeks'] = test_data_m['Promo_Weeks'].apply(lambda x: x if x > 0 else 0) month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} test_data_m['month_str'] = test_data_m.Date_Month.map(month2str) test_data_m['IsPromoMonth'] = test_data_m[[ 'PromoInterval' , 'month_str' ]].apply(ispromomonth , axis = 1) test_data_m.fillna(0, inplace=True) features = ['Store', 'DayOfWeek', 'Open', 'Promo','StateHoliday', 'SchoolHoliday', 'Date_Year', 'Date_Month', 'Date_Day','StoreType', 'Assortment', 'CompetitionDistance','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2','Promo2SinceWeek', 'Promo2SinceYear', 'date_WeekOfYear', 'Competition_Weeks', 'Promo_Weeks', 'IsPromoMonth'] self.test_final = test_data_m[features] def prepare_sample_data(self , limit =100 , testing_limit = 30): self.data = self.train_final.sample(frac = 1 , random_state = 98).head(limit) self.test_data = self.train_final.sample(frac = 1 , random_state = 98).tail(testing_limit) def Linear_Regression(self): print('Creating Linear Regression Model Between Sales and Customers... ') lr = LinearRegression() lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') print(lr) r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) # print('R2 Score is ',r2_score) # print('Since the Model R2 Score is ',r2_score , ', the model explains ',round(r2_score*100,2) , ' % of the variation in GI') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r' , label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b' , label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Linear Regression Training.png') plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g' , label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y' , label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Linear Regression Testing.png') def display_graphs(self,simple = False , orders = 1): for i in range(1, orders+1,1): lm = sns.lmplot(x =""Customers"", y =""Sales"", data = self.data, scatter = True, order = i, fit_reg = True, ci = 95 ) lm.fig.suptitle(""Scatter plot with Order = ""+str(i), fontsize=16) def Mulitple_Linear_Regression(self): print('Creating Multiple Linear Regression Model... ') print('Using Columns -> ',self.data.drop(columns = ['Sales','Customers']).columns) lr = LinearRegression() lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) print(lr) print('Fitting Done on Model ... 
') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) self.data['pred'] = y_pred self.test_data['pred'] = ty_pred plt.figure(figsize=(15,10)) sns.jointplot(x = 'Sales' , y = 'pred' , data = self.data, height=10, ratio=3 , color='g' ) plt.savefig('Multiple Linear Regression Training.png') plt.figure(figsize=(15,10)) sns.jointplot(x = 'Sales' , y = 'pred' , data = self.test_data, height=10, ratio=3 , color='r' ) plt.savefig('Multiple Linear Regression Testing.png') # plt.figure(figsize=(15,10)) # plt.scatter(self.test_data['Customers'].values.reshape(-1,1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g',label = 'Actual Values') # plt.scatter(self.test_data['Customers'].values.reshape(-1,1) , ty_pred , color ='y', label = 'Predicted') # plt.plot(self.test_data['Customers'].values.reshape(-1,1) , ty_pred , color ='k' , label = 'Predicted Line') # plt.xlabel('Customers Index') # plt.ylabel('Sales Index') # plt.legend() # plt.savefig('Multiple Linear Regression Testing.png') def Polynomial_Regression(self , degrees = 4): print('To Reduce Complexity...\ Using Single Data Column Customers Rather than All...') Input=[('polynomial',PolynomialFeatures(degree=degrees)),('modal',LinearRegression())] lr=Pipeline(Input) lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) # print('R2 Score is ',r2_score) # print('Since the Model R2 Score is ',r2_score , ', the model explains ',round(r2_score*100,2) , ' % of the variation in GI') self.data.sort_values(by='Customers' , inplace = True) self.test_data.sort_values(by='Customers' , inplace = True) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r',label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b', label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Polynomial Regression Training {}.png'.format(degrees)) plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g',label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y', label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Polynomial Regression Testing {}.png'.format(degrees)) def return_model(self,reg = 'Ridge' , alpha = 0.01): if reg == 'Ridge': lr = Ridge(alpha=alpha) elif reg =='Lasso': lr = Lasso(alpha=alpha) elif reg =='Elastic': lr = ElasticNet(alpha = alpha) else: lr = Ridge(alpha=alpha , solver = 'cholesky', tol = .005) return lr def Other_Regression(self , reg = 'Ridge'): print('Creating Multiple {} Regression Model... '.format(reg)) print('Using Columns -> ',self.data.drop(columns = ['Sales','Customers']).columns) lr = self.return_model(reg = reg) lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) print(lr) print('Fitting Done on Model ... 
') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) print('Creating Alpha VS Mean Squared Error Graph for Alpha') alphas = [] train_loss = [] test_loss = [] for i in range(10000): alphas.append(i*0.0015 +0.0001) lr = self.return_model(reg = reg , alpha = (i*0.0015 +0.0001)) lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) train_loss.append(rms) test_loss.append(trms) plt.figure(figsize=(15,10)) plt.plot(alphas , train_loss , color ='r' , label = 'Training Loss') plt.xlabel('Alpha') plt.ylabel('Loss (RMSE)') plt.legend() plt.savefig('{} Regression Alpha Training.png'.format(reg)) plt.figure(figsize=(15,10)) plt.plot(alphas , test_loss , color ='g' , label = 'Testing Loss') plt.xlabel('Alpha') plt.ylabel('Loss (RMSE)') plt.legend() plt.savefig('{} Regression Alpha Testing.png'.format(reg)) print('Using Single Column now ....') lr = self.return_model(reg = reg) lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') print(lr) r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) print('Coefficients for the linear regression problem are ',lr.coef_) print('Intercept Value is ',lr.intercept_) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r' , label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b' , label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('{} Regression Training.png'.format(reg)) plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g' , label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y' , label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('{} Regression Testing.png'.format(reg))'",Yes,1,7.0 ross = Rossmann_(),No,5,77.0 "ross.prepare_sample_data(limit =200 , testing_limit = 40) ross.Linear_Regression()",No,4,7.0 "ross.prepare_sample_data(limit =2000 , testing_limit = 400) ross.Mulitple_Linear_Regression()",No,4,7.0 "ross.prepare_sample_data(limit =10000 , testing_limit = 4000) ross.Polynomial_Regression(degrees = 3)",No,4,7.0 "ross.prepare_sample_data(limit =10000 , testing_limit = 4000) ross.Polynomial_Regression(degrees = 2)",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Ridge')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Lasso')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Elastic')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Bridge')",No,4,7.0 "# Get list of categorical variables new= data[data.columns[~data.columns.isin(['Open Date','days','year','month'])]] numerical_features = new.select_dtypes([np.number]).columns.tolist() categorical_features = new.select_dtypes(exclude = [np.number,np.datetime64]).columns.tolist() categorical_features ",No,4,8.0 "from sklearn import preprocessing from sklearn.model_selection import train_test_split",No,5,22.0 "#data = data.drop('Id', axis=1) #test_data = test_data.drop('Id', axis=1) y= data.revenue x_train = data[data.columns[~data.columns.isin(['Open Date','revenue'])]] #train features to be fit in model x_test = test_data[test_data.columns[~test_data.columns.isin(['Open Date'])]] #test features ",No,5,21.0 "from sklearn.preprocessing import LabelEncoder # Processing the categorical columns to provide vector form of feature class DataFrameProcess: def __init__(self,df,col): self.df =df self.col=col def dataEncoding(self): if self.df[self.col].dtype.name == 'object' or 
self.df[self.col].dtype.name == 'category': le = LabelEncoder() self.df[self.col] = le.fit_transform(self.df[self.col]) def data_transform(df): for col in df.columns: data_prcs = DataFrameProcess(df,col) data_prcs.dataEncoding() data_transform(x_train) data_transform(x_test)",Yes,4,8.0 x_train.head(5),No,5,41.0 "from xgboost import XGBRegressor from sklearn.ensemble import GradientBoostingRegressor gbRegr = GradientBoostingRegressor(max_depth=3, random_state=42) gbRegr.fit(x_train, y) prediction_rr = gbRegr.predict(x_test) ",Yes,4,7.0 "test_label=pd.read_csv('../input/restaurant-revenue-prediction/sampleSubmission.csv') # test target test_label.head(10)",No,4,45.0 "from sklearn.metrics import mean_squared_error from math import sqrt label_list=test_label['Prediction'].tolist()",Yes,5,16.0 "print('Root Mean squared error {}'.format(sqrt(mean_squared_error(label_list, prediction_rr)))) ",No,5,49.0 "from sklearn import ensemble params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.05, 'loss': 'ls'} GBR = ensemble.GradientBoostingRegressor(**params) GBR.fit(x_train, y) preds_GBR = GBR.predict(x_test) GradientBoostingRegressor_RMSE= sqrt(mean_squared_error(label_list, preds_GBR)) print('Root Mean squared error {}'.format(GradientBoostingRegressor_RMSE))",Yes,3,48.0 "parameters = [{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0] }] from sklearn.model_selection import GridSearchCV gsearch = GridSearchCV(estimator=XGBRegressor(), param_grid = parameters, scoring='neg_mean_absolute_error', n_jobs=4,cv=3) gsearch.fit(x_train,y) gsearch.best_params_, gsearch.best_score_",Yes,4,7.0 "final_model = XGBRegressor(n_estimators=gsearch.best_params_.get('n_estimators'), learning_rate=gsearch.best_params_.get('learning_rate'), n_jobs=4)",No,5,4.0 "final_model.fit(x_train, y)",No,5,7.0 preds_test = final_model.predict(x_test),No,5,48.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from math import sqrt rf_model = RandomForestRegressor(random_state=1) rf_model.fit(x_train, y) rf_val_predictions = rf_model.predict(x_test) RMSE = sqrt(mean_squared_error(label_list,rf_val_predictions)) print(RMSE) ",Yes,4,7.0 "submission = pd.DataFrame({ ""Id"": test_data[""Id""], ""Prediction"": rf_val_predictions }) submission.to_csv('submission.csv',header=True, index=False) print('done')'",No,5,25.0 "b""month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \\\n 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\nall_data['month_str'] = all_data.Month.map(month2str)\n\ndef check(row):\n if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:\n return 1\n else:\n return 0\n\nall_data['IsPromoMonth'] = all_data.apply(lambda row: check(row),axis=1) """,No,5,20.0 "tmp = all_data[all_data['part']=='train'] sns.boxplot('IsPromoMonth', 'Sales', data=tmp)",No,5,33.0 "all_data ['isBeforeCompetition'] = all_data.apply(lambda x: 1 if x['Year'] < x['CompetitionOpenSinceYear'] else 0, 1)",No,5,14.0 "tmp = all_data[all_data['part']=='train'] sns.boxplot('isBeforeCompetition', 'Sales', data=tmp)",No,5,33.0 "fig, ax = plt.subplots(5, 1, figsize=(15,10)) for p in range (5): i = np.random.choice(data['Store'].unique()) data[data['Store']== i ].plot('Date', 'Sales', ax=ax[p]) ax[p] .set_title(""Store %d"" %i) plt.tight_layout() plt.show()'",No,5,75.0 "fig, ax = 
plt.subplots (1,5, figsize=(25,4)) sns.boxplot('StoreType', 'Sales','Promo', data=data, ax=ax[0]) sns.boxplot('StoreType', 'Sales', 'SchoolHoliday', data=data, ax=ax[1]) sns.boxplot('StoreType','Sales','Assortment', data=data, ax=ax[2]) sns.boxplot('StoreType', 'Sales', 'StateHoliday', data=data, ax=ax[3]) sns.boxplot('StoreType', 'Sales', 'Promo2', data=data, ax=ax[4]) plt.tight_layout()",No,5,33.0 "grid = sns.FacetGrid(data, col=""StoreType"", row=""Promo"", palette=""tab10"", col_order=""abcd"") grid.map(sns.pointplot, ""Month"", ""Sales"") plt.show()",No,5,33.0 all_data['SalesPerCustomer'] = data['Sales']/data['Customers'],No,5,8.0 "grid = sns.FacetGrid(all_data, col=""StoreType"", row=""Promo"", palette=""tab10"", col_order=""abcd"") grid.map(sns.pointplot, ""Month"", ""SalesPerCustomer"") plt.show()",No,5,33.0 "mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} all_data.StoreType.replace(mappings, inplace=True) all_data.Assortment.replace(mappings, inplace=True) all_data.StateHoliday.replace(mappings, inplace=True)",No,5,20.0 "def prepareDf (df, submission=False): tests_date = ""2015-06-12"" tmp_data = all_data[all_data['part']=='train'] .copy() if not submission: tmp_data = tmp_data[tmp_data['Date'] 0 else 0) train['PromoOpen'] = train.PromoOpen.apply(lambda x: x if x > 0 else 0) test['CompetitionOpen'] = 12 * (test.Year - test.CompetitionOpenSinceYear) + (test.Month - test.CompetitionOpenSinceMonth) test['PromoOpen'] = 12 * (test.Year - test.Promo2SinceYear) + (test.WeekOfYear - test.Promo2SinceWeek) / 4.0 test['CompetitionOpen'] = test.CompetitionOpen.apply(lambda x: x if x > 0 else 0) test['PromoOpen'] = test.PromoOpen.apply(lambda x: x if x > 0 else 0)",No,5,8.0 "month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} train['monthStr'] = train.Month.map(month2str) train.loc[train.PromoInterval == 0, 'PromoInterval'] = '' train['IsPromoMonth'] = 0 for interval in train.PromoInterval.unique(): if interval != '': for month in interval.split(','): train.loc[(train.monthStr == month) & (train.PromoInterval == interval), 'IsPromoMonth'] = 1 month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} test['monthStr'] = test.Month.map(month2str) test.loc[test.PromoInterval == 0, 'PromoInterval'] = '' test['IsPromoMonth'] = 0 for interval in test.PromoInterval.unique(): if interval != '': for month in interval.split(','): test.loc[(test.monthStr == month) & (test.PromoInterval == interval), 'IsPromoMonth'] = 1 ",No,4,20.0 "train.keys(),test.keys()",No,5,40.0 "train.drop(['Date','Customers','Open','PromoInterval','monthStr'],axis=1,inplace =True) test.drop(['Date','Open','PromoInterval','monthStr'],axis=1,inplace =True) # train = train[train.Sales != 0] ho_xtrain = train.drop(['Sales'],axis=1 ) ho_ytrain = train.Sales ho_xtest=test ho_xtest=ho_xtest.sort_values(by=['Id']) # ho_xtest = test.drop(['Sales'],axis=1 ) # ho_ytest = test.Sales",No,4,10.0 "ho_xtest ",No,5,41.0 " ho_xtest.keys() , ho_xtrain.keys()",No,5,40.0 " preprocessed_dataset=ho_xtrain.to_numpy() #preprocessed_train_labels=np.log1p(ho_ytrain.to_numpy()+1) preprocessed_train_labels=(ho_ytrain.to_numpy()+1)/1000 preprocessed_test_dataset=ho_xtest.to_numpy() # preprocessed_tr_labels=np.log1p(ho_ytest.to_numpy()) # preprocessed_train_labels=ho_ytrain.to_numpy()",No,5,16.0 " preprocessed_test_dataset=ho_xtest.to_numpy() ",No,4,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, 
y_train, y_test = train_test_split( preprocessed_dataset, preprocessed_train_labels, test_size=0.2) X_train=np.expand_dims(X_train,axis=-1) X_test=np.expand_dims(X_test,axis=-1) y_train=np.array(y_train) y_test=np.array(y_test) y_train=y_train y_test=y_test print(X_train.shape) y_train.max(),y_test.max() ",No,4,21.0 "def rmspe(y_true, y_pred): ''' RMSPE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(((y_true) - (y_pred) ) / (y_true)), axis=-1)) def rmse(y_true, y_pred): ''' RMSE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(y_pred - y_true))) def rmspe_val(y_true, y_pred): ''' RMSPE calculus to validate evaluation metric about the model ''' return np.sqrt(np.mean(np.square(((y_true) - (y_pred) ) / (y_true)), axis=0))[0] ",No,5,84.0 "from keras.utils.np_utils import to_categorical from keras.models import Model, Sequential, model_from_json from keras.optimizers import SGD, Adam, RMSprop from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding,BatchNormalization,Input,Add,Concatenate from keras.initializers import RandomNormal, Constant, he_normal from keras.callbacks import ModelCheckpoint, EarlyStopping from keras import regularizers from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D,Dense, Dropout, Flatten, Reshape, GlobalAveragePooling1D import keras from keras import backend as K import tensorflow as tf def model1(): initializer = he_normal() dilation_rate=1 bn=BatchNormalization inp=Input(shape=(X_train.shape[1],1)) x1=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(inp)) x2=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x1)) x2= Concatenate()([x1,x2]) x3=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x2)) x3=Concatenate()([x1,x2,x3]) x=bn()(Conv1D(50, kernel_size=1, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x3)) x=MaxPooling1D(2)(x) x3=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x)) x4=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x3)) x4= Concatenate()([x3,x4]) x5=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x4)) x=Concatenate()([x3,x4,x5]) x=bn()(Conv1D(100, kernel_size=1, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x)) x=GlobalAveragePooling1D()(x) x=Dense(500, activation=""linear"")(x) y=Dense(1)(x) model= Model(inputs=inp, outputs= y) adam = Adam(lr=1e-3) model.compile(loss=""mae"", optimizer=adam, metrics=[rmspe,""mse"",""mae"",rmse]) # Compile model return model # model_m.compile(loss=""mae"", optimizer=adam, metrics=[rmspe,""mae"",""mse"",rmse]) model_m=model1() print('Build model...') model_m.summary()'",No,3,4.0 " batch_size=80000 nb_epoch=400 print('Fit model...') filepath=""weights_rossmann.best.hdf5"" checkpoint = ModelCheckpoint(filepath, monitor='val_rmspe', verbose=1, save_best_only=True, mode='min') callbacks_list = [checkpoint] log = model_m.fit(X_train, y_train, validation_data=(X_test,y_test), batch_size=batch_size ,epochs=nb_epoch, shuffle=True, callbacks=callbacks_list)'",No,4,7.0 
"model_m.load_weights(filepath) preprocessed_test_dataset ypred=model_m.predict(np.expand_dims(preprocessed_test_dataset[:,1:],axis=-1)) # results=np.concatenate([np.expand_dims(preprocessed_test_dataset[:,0],axis=-1),np.expm1(ypred)-1],axis=-1) results=np.concatenate([np.expand_dims(preprocessed_test_dataset[:,0],axis=-1),ypred*1000],axis=-1)",No,3,48.0 "import csv with open('submission.csv', mode='w') as csv_file: fieldnames = ['Id', 'Sales'] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() for i in results: print(i) writer.writerow({'Id':i[0], 'Sales': max(0,i[1])}) #writer.writerow({'emp_name': 'John Smith', 'dept': 'Accounting', 'birth_month': 'November'}) #writer.writerow({'emp_name': 'Erica Meyers', 'dept': 'IT', 'birth_month': 'March'}) ",No,5,25.0 "for a,b in zip(y_test,X_test): if a==0: print(a,b[0],b[1],b[2],b[3],b[4],b[5],b[6],b[7]) #Store, DayOfWeek ,int(Date[0]),int(Date[1]),int(Date[2]), Open, Promo,StateHoliday, SchoolHoliday",No,5,53.0 !ls,No,3,88.0 "import pandas as pd from pandas import datetime import matplotlib.pyplot as plt import seaborn as sns import numpy as np",No,5,22.0 "df = pd.read_csv('../input/rossmann-store-sales/train.csv', parse_dates = ['Date'], low_memory = False) df.head()",No,4,45.0 "df['Date']=pd.to_datetime(df['Date'],format='%Y-%m-%d')",No,5,16.0 "df['Hour'] = df['Date'].dt.hour df['Day_of_Month'] = df['Date'].dt.day df['Day_of_Week'] = df['Date'].dt.dayofweek df['Month'] = df['Date'].dt.month",No,5,8.0 "print(df['Date'].min()) print(df['Date'].max())",No,5,40.0 "test = pd.read_csv('../input/rossmann-store-sales/test.csv', parse_dates = True, low_memory = False) test.head()",No,4,45.0 "test['Date']=pd.to_datetime(test['Date'],format='%Y-%m-%d')",No,5,16.0 "test['Hour'] = test['Date'].dt.hour test['Day_of_Month'] = test['Date'].dt.day test['Day_of_Week'] = test['Date'].dt.dayofweek test['Month'] = test['Date'].dt.month",No,5,8.0 "print(test['Date'].min()) print(test['Date'].max())",No,5,40.0 "sns.pointplot(x='Month', y='Sales', data=df)",No,5,81.0 "sns.pointplot(x='Day_of_Week', y='Sales', data=df)",No,5,75.0 "sns.countplot(x = 'Day_of_Week', hue = 'Open', data = df) plt.title('Store Daily Open Countplot')",No,5,75.0 "sns.pointplot(x='Day_of_Month', y='Sales', data=df)",No,5,75.0 "df['SalesPerCustomer'] = df['Sales']/df['Customers'] df['SalesPerCustomer'].describe()",No,4,40.0 df.Open.value_counts(),No,5,72.0 np.sum([df['Sales'] == 0]),No,5,72.0 "#drop closed stores and stores with zero sales df = df[(df[""Open""] != 0) & (df['Sales'] != 0)]'",No,5,14.0 "store = pd.read_csv('../input/rossmann-store-sales/store.csv') store.head(30)",No,4,45.0 store.isnull().sum(),No,5,39.0 "store['CompetitionDistance'] = store['CompetitionDistance'].fillna(store['CompetitionDistance'].max()) store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].fillna(store['CompetitionOpenSinceMonth'].mode().iloc[0]) #try 0 store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].fillna(store['CompetitionOpenSinceYear'].mode().iloc[0]) #try 0 store['Promo2SinceWeek'] = store['Promo2SinceWeek'].fillna(0) #try 0 store['Promo2SinceYear'] = store['Promo2SinceYear'].fillna(store['Promo2SinceYear'].mode().iloc[0]) #try 0 store['PromoInterval'] = store['PromoInterval'].fillna(store['PromoInterval'].mode().iloc[0]) #try 0 store.head()",No,5,17.0 "df_store = pd.merge(df, store, how = 'left', on = 'Store') df_store.head()",No,4,32.0 df_store.groupby('StoreType')['Sales'].describe(),No,5,60.0 "df_store.groupby('StoreType')['Customers', 
'Sales'].sum()",No,5,60.0 "#sales trends sns.catplot(data = df_store, x = 'Month', y = ""Sales"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c') '",No,5,75.0 "#customer trends sns.catplot(data = df_store, x = 'Month', y = ""Customers"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c')'",No,5,75.0 "#sales per customer sns.catplot(data = df_store, x = 'Month', y = ""SalesPerCustomer"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c')'",No,5,75.0 "sns.catplot(data = df_store, x = 'Month', y = ""Sales"", col = 'DayOfWeek', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'StoreType', # per store type in rows color = 'c') '",No,5,75.0 "#stores open on sunday df_store[(df_store.Open == 1) & (df_store.DayOfWeek == 7)]['Store'].unique()",No,5,57.0 "sns.catplot(data = df_store, x = 'DayOfWeek', y = ""Sales"", col = 'Promo', row = 'Promo2', hue = 'Promo2', palette = 'RdPu') '",No,5,75.0 "df_store['StateHoliday'] = df_store['StateHoliday'].map({'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3}) df_store['StateHoliday'] = df_store['StateHoliday'].astype(int)",No,4,20.0 "df_store['StoreType'] = df_store['StoreType'].map({'a':1 , 'b':2 , 'c':3 , 'd':4}) df_store['StoreType'] = df_store['StoreType'].astype(int)",No,4,20.0 df_store.isnull().sum(),No,5,39.0 "df_store['Assortment'] = df_store['Assortment'].map({'a':1 , 'b':2 , 'c':3}) df_store['Assortment'] = df_store['Assortment'].astype(int)",No,4,20.0 "df_store['PromoInterval'] = df_store['PromoInterval'].map({'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3}) df_store['PromoInterval'] = df_store['PromoInterval'].astype(int)",No,4,20.0 "df_store.to_csv('df_merged.csv', index=False)",No,5,25.0 len(df_store),No,5,58.0 "test = pd.merge(test, store, how = 'left', on = 'Store') test.head()",No,4,32.0 "test.fillna(method='ffill', inplace=True)",No,5,17.0 "test['StateHoliday'] = test['StateHoliday'].map({'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3}) test['StateHoliday'] = test['StateHoliday'].astype(int) test['StoreType'] = test['StoreType'].map({'a':1 , 'b':2 , 'c':3 , 'd':4}) test['StoreType'] = test['StoreType'].astype(int) test['Assortment'] = test['Assortment'].map({'a':1 , 'b':2 , 'c':3}) test['Assortment'] = test['Assortment'].astype(int) test['PromoInterval'] = test['PromoInterval'].map({'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3}) test['PromoInterval'] = test['PromoInterval'].astype(int)",No,4,20.0 "test.to_csv('test_merged.csv', index=False)",No,5,25.0 "test = test.drop(['Id','Date'],axis=1)",No,5,10.0 "X = df_store.drop(['Date','Sales','Customers', 'SalesPerCustomer'],1) #Transform Target Variable y = np.log1p(df_store['Sales']) from sklearn.model_selection import train_test_split X_train , X_val , y_train , y_val = train_test_split(X, y , test_size=0.30 , random_state = 1 )",No,3,21.0 "X_train.shape, X_val.shape, y_train.shape, y_val.shape",No,5,58.0 "from sklearn.ensemble import GradientBoostingRegressor gbrt = GradientBoostingRegressor(max_depth=10, n_estimators=200, random_state=42) gbrt.fit(X_train, y_train) print(gbrt.score(X_train, y_train))",Yes,3,7.0 y_pred = gbrt.predict(X_val),No,5,48.0 "from sklearn.metrics import r2_score, mean_squared_error print(r2_score(y_val , y_pred)) print(np.sqrt(mean_squared_error(y_val , 
y_pred)))",No,5,49.0 "df1 = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred}) df1.head(25)",No,4,12.0 "test_pred=gbrt.predict(test[X.columns]) test_pred_inv=np.exp(test_pred)-1",No,5,48.0 test_pred_inv,No,5,41.0 "#make submission df prediction = pd.DataFrame(test_pred_inv) submission = pd.read_csv('../input/rossmann-store-sales/sample_submission.csv') prediction_df = pd.concat([submission['Id'], prediction], axis=1) prediction_df.columns=['Id','Sales'] prediction_df.to_csv('Sample_Submission.csv', index=False)",No,3,25.0 prediction_df.head(),No,5,41.0 "# import pandas as pd import numpy as np import xgboost as xgb import missingno as msno import seaborn as sns import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline'",No,5,23.0 "b""# \ntrain = pd.read_csv('../input/rossmann-store-sales/train.csv')\ntest = pd.read_csv('../input/rossmann-store-sales/test.csv')\nstore = pd.read_csv('../input/rossmann-store-sales/store.csv')""",No,5,45.0 "train.info(), test.info(), store.info()",No,5,40.0 "fig = plt.figure(figsize=(16,6)) ax1 = fig.add_subplot(121) ax1.set_xlabel('Sales') ax1.set_ylabel('Count') ax1.set_title('Sales of Closed Stores') plt.xlim(-1,1) train.loc[train.Open==0].Sales.hist(align='left') ax2 = fig.add_subplot(122) ax2.set_xlabel('Sales') ax2.set_ylabel('PDF') ax2.set_title('Sales of Open Stores') sns.distplot(train.loc[train.Open!=0].Sales) print('The skewness of Sales is {}'.format(train.loc[train.Open!=0].Sales.skew()))",No,5,33.0 "train = train.loc[train.Open != 0] train = train.loc[train.Sales > 0].reset_index(drop=True)",No,5,14.0 "# train train[train.isnull().values==True]'",No,5,14.0 "# test test[test.isnull().values==True]'",No,5,14.0 "# store msno.matrix(store)'",No,5,34.0 "# test test.fillna(1,inplace=True) # CompetitionDistance store.CompetitionDistance = store.CompetitionDistance.fillna(store.CompetitionDistance.median()) # 0 store.fillna(0,inplace=True)'",No,5,17.0 "b""# \ntrain = pd.merge(train, store, on='Store')\ntest = pd.merge(test, store, on='Store')""",No,5,32.0 "import statsmodels.api as sm from statsmodels.sandbox.regression.predstd import wls_prediction_std import math import sklearn.preprocessing as skpe import sklearn.model_selection as ms import sklearn.metrics as sklm import sklearn.ensemble as sken import sklearn.linear_model as lm import seaborn as sns import matplotlib.pyplot as plt",No,5,22.0 "# Reading files path=""../input/rossmann-store-sales/train.csv"" train=pd.read_csv(path) print(train.shape) train.head()",Yes,4,45.0 "path1=""../input/rossmann-store-sales/test.csv"" test=pd.read_csv(path1) print(test.shape) test.head()",Yes,4,45.0 "path2=""../input/rossmann-store-sales/store.csv"" store_df=pd.read_csv(path2) print(store_df.shape) store_df.head()",Yes,4,45.0 "train.info() print(""----------------------------------------------"") store_df.info() print(""----------------------------------------------"") test.info()",No,5,40.0 "# Adding new variable train['Sales_per_customer']=train['Sales']/train['Customers'] train['Sales_per_customer'].describe() # An average of 9.49$ is earned from a customer at a particular store",No,4,8.0 "fig, ax1 = plt.subplots(figsize=(15,4)) sns.countplot(x='Open',hue='DayOfWeek', data=train,palette=""husl"", ax=ax1) # This indicates that there are some stores which opens mostly on Sundays while some are closed on Sundays '",No,5,33.0 "# Date # Create Year and Month columns train['Year'] = train['Date'].apply(lambda x: int(str(x)[:4])) train['Month'] = train['Date'].apply(lambda x: int(str(x)[5:7])) 
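# Quick sanity check on the slicing above (a hedged sketch, assuming 'Date' is a
# 'YYYY-MM-DD' string as in the raw csv): str(x)[:4] is the year and str(x)[5:7] is
# the zero-padded month. The _sample_date literal below is illustrative only.
_sample_date = '2015-07-31'
assert int(_sample_date[:4]) == 2015
assert int(_sample_date[5:7]) == 7   # '07' parses to 7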
test['Year'] = test['Date'].apply(lambda x: int(str(x)[:4])) test['Month'] = test['Date'].apply(lambda x: int(str(x)[5:7])) # Assign Date column to Date(Year-Month) instead of (Year-Month-Day) train['Date'] = train['Date'].apply(lambda x: (str(x)[:7])) test['Date'] = test['Date'].apply(lambda x: (str(x)[:7])) # group by date and get average sales, and percent change avg_sales = train.groupby('Date')[""Sales""].mean() pct_change_sales = train.groupby('Date')[""Sales""].sum().pct_change() fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8)) # plot average sales over time(year-month) ax1 = avg_sales.plot(legend=True,ax=axis1,marker='o',title=""Average Sales"") ax1.set_xticks(range(len(avg_sales))) ax1.set_xticklabels(avg_sales.index.tolist(), rotation=90) # plot precent change for sales over time(year-month) ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap=""summer"",title=""Sales Percent Change"")'",Yes,4,8.0 "# Plot average sales and customers over years fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Year', y='Sales', data=train, ax=axis1) sns.barplot(x='Year', y='Customers', data=train, ax=axis2)",No,5,75.0 "# Plot average sales and customers over days of week fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='DayOfWeek', y='Sales', data=train, ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=train, ax=axis2) ",No,5,75.0 "# Plot average sales and customers over months fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Month', y='Sales', data=train, ax=axis1) sns.barplot(x='Month', y='Customers', data=train, ax=axis2)",No,5,75.0 "# Plot average sales and customers with/without promo fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo', y='Sales', data=train, ax=axis1) sns.barplot(x='Promo', y='Customers', data=train, ax=axis2)",No,5,33.0 "b""def build_features(features, data):\n\n # \n features.extend(['Store','CompetitionDistance','CompetitionOpenSinceMonth','StateHoliday','StoreType','Assortment',\n 'SchoolHoliday','CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])\n \n # https://blog.csdn.net/aicanghai_smile/article/details/80987666\n \n # dt\n features.extend(['Year','Month','Day','DayOfWeek','WeekOfYear'])\n data['Year'] = data.Date.dt.year\n data['Month'] = data.Date.dt.month\n data['Day'] = data.Date.dt.day\n data['DayOfWeek'] = data.Date.dt.dayofweek\n data['WeekOfYear'] = data.Date.dt.weekofyear\n \n # 'CompetitionOpen'\n # 'PromoOpen'\n # \n features.extend(['CompetitionOpen','PromoOpen'])\n data['CompetitionOpen'] = 12*(data.Year-data.CompetitionOpenSinceYear) + (data.Month-data.CompetitionOpenSinceMonth)\n data['PromoOpen'] = 12*(data.Year-data.Promo2SinceYear) + (data.WeekOfYear-data.Promo2SinceWeek)/4.0\n data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0) \n data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)\n \n # 'IsPromoMonth'10\n features.append('IsPromoMonth')\n month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\n data['monthStr'] = data.Month.map(month2str)\n data.loc[data.PromoInterval==0, 'PromoInterval'] = ''\n data['IsPromoMonth'] = 0\n for interval in data.PromoInterval.unique():\n if interval != '':\n for month in interval.split(','):\n data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1\n \n # \n mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}\n 
data.StoreType.replace(mappings, inplace=True)\n data.Assortment.replace(mappings, inplace=True)\n data.StateHoliday.replace(mappings, inplace=True)\n data['StoreType'] = data['StoreType'].astype(int)\n data['Assortment'] = data['Assortment'].astype(int)\n data['StateHoliday'] = data['StateHoliday'].astype(int)""",No,3,20.0 "b""# Date\ntrain.Date = pd.to_datetime(train.Date, errors='coerce')\ntest.Date = pd.to_datetime(test.Date, errors='coerce')\n\n# features\nfeatures = []\n\n# traintest\nbuild_features(features, train)\nbuild_features([], test)\n\n# \nprint(features)""",No,3,16.0 "# Rmspe # https://www.kaggle.com/justdoit/xgboost-in-python-with-rmspe def ToWeight(y): w = np.zeros(y.shape, dtype=float) ind = y != 0 w[ind] = 1./(y[ind]**2) return w def rmspe(yhat, y): w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return rmspe def rmspe_xg(yhat, y): y = y.get_label() y = np.expm1(y) yhat = np.expm1(yhat) w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return ""rmspe"", rmspe def neg_rmspe(yhat, y): y = np.expm1(y) yhat = np.expm1(yhat) w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return -rmspe'",No,5,84.0 "from sklearn.model_selection import GridSearchCV, ShuffleSplit from sklearn.metrics import make_scorer from sklearn.tree import DecisionTreeRegressor regressor = DecisionTreeRegressor(random_state=2) cv_sets = ShuffleSplit(n_splits=5, test_size=0.2) params = {'max_depth':range(10,40,2)} scoring_fnc = make_scorer(neg_rmspe) grid = GridSearchCV(regressor,params,scoring_fnc,cv=cv_sets) grid = grid.fit(train[features], np.log1p(train.Sales)) DTR = grid.best_estimator_",No,4,6.0 "# DTR.get_params()'",No,5,79.0 "# submission = pd.DataFrame({""Id"": test[""Id""], ""Sales"": np.expm1(DTR.predict(test[features]))}) submission.to_csv(""benchmark.csv"", index=False)'",No,5,25.0 "b""# \nparams = {'objective': 'reg:linear',\n 'eta': 0.01,\n 'max_depth': 11,\n 'subsample': 0.5,\n 'colsample_bytree': 0.5,\n 'silent': 1,\n 'seed': 1\n }\nnum_trees = 10000""",No,5,59.0 "b""# \nfrom sklearn.model_selection import train_test_split\n\nX_train, X_test = train_test_split(train, test_size=0.2, random_state=2)\n\ndtrain = xgb.DMatrix(X_train[features], np.log1p(X_train.Sales))\ndvalid = xgb.DMatrix(X_test[features], np.log1p(X_test.Sales))\ndtest = xgb.DMatrix(test[features])\n\nwatchlist = [(dtrain, 'train'),(dvalid, 'eval')]\ngbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=False)""",No,2,7.0 "# test_probs = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit) indices = test_probs < 0 test_probs[indices] = 0 submission = pd.DataFrame({""Id"": test[""Id""], ""Sales"": np.expm1(test_probs)}) submission.to_csv(""xgboost.csv"", index=False)'",Yes,2,48.0 "from fastai.tabular import * from isoweek import Week #import tarfile for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # path to external datasets tar = tarfile.open('/kaggle/input/external-datasets/rossmann.tgz', ""r:gz"")'",No,4,88.0 "# place holders path = ""/kaggle/input/rossmann-store-sales/"" base_path=""../output""",No,3,44.0 "# paths to kaggle data sets train = ""/kaggle/input/rossmann-store-sales/train.csv"" test = ""/kaggle/input/rossmann-store-sales/test.csv"" store = ""/kaggle/input/rossmann-store-sales/store.csv"" # paths to external tar file datasets store_states = tar.extractfile('store_states.csv') state_names = tar.extractfile('state_names.csv') 
googletrend = tar.extractfile('googletrend.csv') weather = tar.extractfile('weather.csv')'",No,4,45.0 "# read in kaggle and external datasets as dataframes table_names = [train, store, store_states, state_names, googletrend, weather, test] tables = [pd.read_csv(fpath, low_memory=False) for fpath in table_names] train, store, store_states, state_names, googletrend, weather, test = tables len(train),len(test)",No,4,45.0 "print(train.shape) train.head()",No,4,58.0 "print(test.shape) test.head()",No,4,58.0 "print(store.shape) store.head()",No,4,58.0 "print(store_states.shape) store_states.head()",No,4,58.0 "print(googletrend.shape) googletrend.head()",No,4,58.0 "print(weather.shape) weather.head()",No,4,58.0 "print(train.StateHoliday.unique()) print(test.StateHoliday.unique())",No,5,57.0 "train.StateHoliday = train.StateHoliday!='0' test.StateHoliday = test.StateHoliday!='0'",No,5,14.0 "def join_df(left, right, left_on, right_on=None, suffix='_y'): if right_on is None: right_on = left_on return left.merge(right, how='left', left_on=left_on, right_on=right_on, suffixes=("""",suffix))'",No,5,32.0 "weather = join_df(weather, state_names, ""file"", ""StateName"") weather.head(3)",No,4,32.0 "googletrend['Date'] = googletrend.week.str.split(' - ', expand=True)[0] googletrend['State'] = googletrend.file.str.split('_', expand=True)[2] googletrend.loc[googletrend.State=='NI', ""State""] = 'HB,NI''",No,4,78.0 googletrend.head(3),No,5,41.0 "def add_datepart(df, fldname, drop=True, time=False): ""Helper function that adds columns relevant to a date."" fld = df[fldname] fld_dtype = fld.dtype if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): fld_dtype = np.datetime64 if not np.issubdtype(fld_dtype, np.datetime64): df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True) targ_pre = re.sub('[Dd]ate$', '', fldname) attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'] if time: attr = attr + ['Hour', 'Minute', 'Second'] for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower()) df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9 if drop: df.drop(fldname, axis=1, inplace=True)'",No,2,8.0 "add_datepart(googletrend,""Date"", drop=False) googletrend.head(3)",No,2,16.0 "# continue with all other tables add_datepart(weather, ""Date"", drop=False) add_datepart(train, ""Date"", drop=False) add_datepart(test, ""Date"", drop=False)",No,3,16.0 "trend_de = googletrend[googletrend.file == 'Rossmann_DE'] trend_de.head(3)",No,4,14.0 "store = join_df(store, store_states, ""Store"") len(store[store.State.isnull()])",No,4,32.0 "joined = join_df(train, store, ""Store"") joined_test = join_df(test, store, ""Store"") len(joined[joined.StoreType.isnull()]), len(joined_test[joined_test.StoreType.isnull()])",No,4,32.0 "# join the joined df with googletrend with [""State"",""Year"",""Week""] as the index # this way the non matching day dates do not create issues. 
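# Roughly, given the join_df helper defined earlier in this notebook, the call below
# expands to a plain left merge on the composite key, something like:
#   joined.merge(googletrend, how='left',
#                left_on=['State', 'Year', 'Week'],
#                right_on=['State', 'Year', 'Week'],
#                suffixes=('', '_y'))
# The Google trend data is weekly, so keying on (State, Year, Week) rather than the
# exact Date lets every daily row pick up the trend value for its week.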
joined = join_df(joined, googletrend, [""State"",""Year"", ""Week""]) joined_test = join_df(joined_test, googletrend, [""State"",""Year"", ""Week""]) len(joined[joined.trend.isnull()]),len(joined_test[joined_test.trend.isnull()])",No,4,32.0 "# now join the overal germany trend joined = joined.merge(trend_de, 'left', [""Year"", ""Week""], suffixes=('', '_DE')) joined_test = joined_test.merge(trend_de, 'left', [""Year"", ""Week""], suffixes=('', '_DE')) len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])'",No,4,32.0 "# finally join the weather data joined = join_df(joined, weather, [""State"",""Date""]) joined_test = join_df(joined_test, weather, [""State"",""Date""]) len(joined[joined.Mean_TemperatureC.isnull()]),len(joined_test[joined_test.Mean_TemperatureC.isnull()])",No,4,32.0 "# now we can drop duplicated columns for df in (joined, joined_test): for c in df.columns: if c.endswith('_y'): if c in df.columns: df.drop(c, inplace=True, axis=1)",No,5,10.0 "for df in (joined,joined_test): df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32) df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32) df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32) df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)",No,5,16.0 "for df in (joined, joined_test): df['CompetitionOpenSince'] = pd.to_datetime(dict(year=df.CompetitionOpenSinceYear, month=df.CompetitionOpenSinceMonth, day=15)) df['CompetitionDaysOpen'] = df.Date.subtract(df.CompetitionOpenSince).dt.days",No,4,16.0 "for df in (joined, joined_test): df.loc[df.CompetitionDaysOpen<0, ""CompetitionDaysOpen""] = 0 df.loc[df.CompetitionOpenSinceYear<1990, ""CompetitionDaysOpen""] = 0",No,4,20.0 "for df in (joined,joined_test): df[""CompetitionMonthsOpen""] = df[""CompetitionDaysOpen""]//30 df.loc[df.CompetitionMonthsOpen>24, ""CompetitionMonthsOpen""] = 24 joined.CompetitionMonthsOpen.unique()",No,4,8.0 "for df in (joined, joined_test): df[""Promo2Since""] = pd.to_datetime(df.apply( lambda x: Week(x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1)) df[""Promo2Days""] = df.Date.subtract(df['Promo2Since']).dt.days'",No,4,16.0 "for df in (joined,joined_test): df.loc[df.Promo2Days<0, ""Promo2Days""] = 0 df.loc[df.Promo2SinceYear<1990, ""Promo2Days""] = 0 df[""Promo2Weeks""] = df[""Promo2Days""]//7 df.loc[df.Promo2Weeks<0, ""Promo2Weeks""] = 0 df.loc[df.Promo2Weeks>25, ""Promo2Weeks""] = 25 df.Promo2Weeks.unique()",No,4,8.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "b""# This decoratore shows some info about function perormance\n# etc time,shpe changes,nan values\n\ndef info(function):\n import datetime\n def wrapper(data,*args,**kargs):\n tic = datetime.datetime.now()\n result = function(data,*args,**kargs)\n toc = datetime.datetime.now()\n print(function.__name__,' took ', toc-tic)\n print('Shape: ',data.shape,' ----> ', result.shape)\n print('NaN value: ', result.isna().sum()[result.isna().sum() != 0])\n print('\\n')\n return result\n return wrapper""",No,5,53.0 "# let`s load datasets as usually train = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv') test = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv') stores = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv') sample = 
pd.read_csv('/kaggle/input/rossmann-store-sales/sample_submission.csv')",No,5,45.0 "# Thank to notebooks we can definve evaluation metric def ToWeight(y): w = np.zeros(y.shape, dtype=float) ind = y != 0 w[ind] = 1./(y[ind]**2) return w def rmspe(yhat, y): w = ToWeight(y) rmspe = np.sqrt(np.mean( w * (y - yhat)**2 )) return rmspe",No,5,84.0 "import seaborn as sns fig,ax =plt.subplots(1,2,figsize = (20,10)) ins1 = ax[0].inset_axes([0.5,0.5,0.4,0.4]) ins2 = ax[1].inset_axes([0.7,0.7,0.2,0.2]) sns.distplot(train[train.Sales != 0].Sales,ax=ax[0],bins=100) sns.distplot(np.log1p(train[train.Sales != 0].Sales),ax=ins1,bins=100,color = 'red') sns.boxplot(train[train.Sales != 0].Sales,ax=ax[1]) sns.boxplot(np.log1p(train[train.Sales != 0].Sales),ax=ins2) # We see that sales values shpw positive skeew, it can be fixed by applying np.log1p (embedded plot) # Also there are some outliers, lets define functions to perform transformation and outliers removal",No,5,33.0 "@info def log_transf(df): # log transformation function to remove skeew df.Sales = np.log1p(df.Sales) df.Customers = np.log1p(df.Customers) return df @info def remove_outliers(df,column='Sales'): # interquntile approach to remove outliers q1 = df[column].quantile(0.2) q3 = df[column].quantile(0.8) iqr = q3-q1 iqr_lower = q1 - 1.5*iqr iqr_upper = q3 + 1.5*iqr df = df.loc[(df[column] > iqr_lower) & (df[column]< iqr_upper),:] return df",No,3,8.0 "@info def timeseries_features(df): # move to datetime format df.Date = pd.to_datetime(df.Date) df = df.sort_values('Date').reset_index(drop = True) # derive regular for ml task time series features df['month'] = df.Date.dt.month df['dayofmonth'] = df.Date.dt.day df['dayofyear'] = df.Date.dt.dayofyear df['year'] = df.Date.dt.year df['is_weekday'] = df.DayOfWeek.apply(lambda x: 0 if x in (6,7) else 1) df['is_month_start'] = df.Date.dt.is_month_start.astype(int) df['is_month_end'] = df.Date.dt.is_month_end.astype(int) # also lets take into account holidays from pandas.tseries.holiday import USFederalHolidayCalendar as calendar holidays = calendar().holidays(start = df.Date.min(), end = df.Date.max()) df['is_holiday'] = df.Date.isin(holidays).astype(int) return df @info def clean_main(df): # drop days with 0 sales df = df.loc[df.Sales != 0,:].reset_index(drop = True) df = df.drop(['Open'],axis = 1) # beacus unique values contain mixed dtype array(['a', '0', 'b', 'c', 0], dtype=object) # also could be fixed during pandas importing df.StateHoliday = df.StateHoliday.astype(str) return df @info def clean_store(df): # lets drop columns with high content of nan values df.CompetitionDistance.fillna(df.CompetitionDistance.mean(),inplace = True) df.drop(['CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Promo2SinceWeek','Promo2SinceYear'],axis = 1,inplace = True) import calendar # We have a list of promo monthes and we can derive usefull feature # presence or absence of promo # first create encoded dictionary Month:Number eg Feb:2 month_dict = {v: k for k,v in enumerate(calendar.month_abbr)} del month_dict[''] del month_dict['Sep'] month_dict['NaN'] = 0 # assign absence of promo 0 month_dict['Sept'] = 9 # There is no Sep # Secondly, we treat PromoInterval columns, making each row list instead of string now we have smth like ['Feb','Mar','Sept'] # and lets apply dictionary df.PromoInterval = df.PromoInterval.fillna('NaN') df.PromoInterval = df.PromoInterval.str.split(',') # Lastly we are applyin transformation df.PromoInterval = df.PromoInterval.apply(lambda x: [month_dict[value] for value in x if 
month_dict.get(value)]) # Lets create new feature that us equal to number of promo monthes df['promo_len'] = df.PromoInterval.apply(lambda x: len(x)) return df # Pipeline for train file train_prep = (train .copy() .pipe(log_transf) .pipe(remove_outliers) .pipe(timeseries_features) .pipe(clean_main) ) # Pipeline for store file store_prep = (stores .copy() .pipe(clean_store) ) # Now we merge two data_prep = pd.merge(train_prep,store_prep,how='left',on='Store') # Using our transformation in PromoInterval interval, we create binary new feature is_promo or not data_prep['is_promo'] = data_prep.apply(lambda x: 1 if x['month'] in x['PromoInterval'] else 0,axis = 1) data_prep = data_prep.drop('PromoInterval',axis=1).reset_index(drop=True)",No,2,8.0 "# Here I would like to know what is rmspe score with th emost dumb approach # we devide data on train and test test_bs = data_prep[data_prep.year == 2015] train_bs = data_prep[data_prep.year < 2015] # I am going to use mean Sales grouped by store-month-day among previous years as predicted values for 2015 predict_bs = (train_bs .groupby(['Store','month','dayofmonth']).Sales.mean().reset_index().rename({'Sales':'predictions'},axis = 1) .merge(test_bs,how='right',on = ['Store','month','dayofmonth']) .fillna(train_bs.Sales.mean()) .sort_values('Date') ) # Display baseline print('Baseline to overcome = {:.2f}'.format(rmspe(np.expm1(predict_bs.Sales),np.expm1(predict_bs.predictions)))) # Let`s see how prediction looks like fig,ax = plt.subplots(1,3,figsize = (30,10)) rnd_store = np.random.randint(min(predict_bs.Store),max(predict_bs.Store),3) for idx,store in enumerate(rnd_store): ax[idx].plot(predict_bs[predict_bs.Store == store].Date,np.expm1(predict_bs[predict_bs.Store == store].Sales), color = 'blue' ,label = 'Observed') ax[idx].plot(predict_bs[predict_bs.Store == store].Date,np.expm1(predict_bs[predict_bs.Store == store].predictions),color = 'red',label = 'Predicted') ax[idx].legend() ax[idx].set_title('Store '+str(store)) # It doesn`t look so bad",Yes,3,33.0 "# There are two few reasons to use mean (aka target) encoding # We have 1115 stores, definetly there is correlation between store and sales # We could perform leave stores as it is ----> not good for known reasons # We could perform OneHotEncoding ----> not goodm becaouse we will have 1115 new columns, mainly sparse # We can do mean encoding, eg encode stores as mean/std/other of target # I am going to use Customers to encode store, because we don`t have customers in test set # Obviusly customers can be good feature def mean_encoding(df,column,target,func = np.mean): # perform target encoding on column with some function enc_col_name = target+'_enc_'+func.__name__ df_temp = (df .groupby(column)[target] .apply(func) .reset_index() .rename({target:enc_col_name},axis=1) ) df = df.merge(df_temp,how='left',on = column) return df,df_temp data_prep,dict_for_test = mean_encoding(data_prep,'Store','Customers',func = np.mean) ",No,1,16.0 "b""# also it is good to statistic\n\nfrom statsmodels.tsa.seasonal import seasonal_decompose\nfrom statsmodels.graphics.tsaplots import plot_acf,plot_pacf\nfrom statsmodels.tsa.stattools import adfuller\n\n\n# first lets check our data for stationarity\n\ncounter = 0\nfor store in data_prep.Store.unique():\n df_store = data_prep.copy().loc[data_prep.Store == store,['Date','Sales']].set_index('Date')\n # since we removed some dates, lets resample data on a daily basis and fillna with 0\n df_store = df_store.resample('D').fillna('bfill')\n adf = 
adfuller(df_store,regression='c', autolag='AIC')\n \n if adf[1] > 0.05:\n print('Adfuller for store {} : p-value = {:.5f} > 5% -----> NON STATIONARY'.format(store,adf[1]*100))\n counter+=1\n # also we can use it as a feature\n # Doesnt make sense becaause only ~3 of store are not statonary\n \nprint('\\n {:.2f} % of stores are non stationary '.format(counter/len(data_prep.Store.unique())*100))\n\n# There is a chance to use traditional time series technique(ARIMA,SARIMAX, smothing) but i would ike to continue with ml""",Yes,2,22.0 "# lets check few random stores rnd_store = np.random.randint(min(data_prep.Store),max(data_prep.Store),3) fig,ax = plt.subplots(3,2,figsize = (15,10)) for idx,store in enumerate(rnd_store): df_store = data_prep.copy().loc[data_prep.Store == store,['Date','Sales']].set_index('Date') df_store = df_store.resample('D').fillna('bfill') plot_acf(df_store,lags = 60,ax = ax[idx,0],label = store) plot_pacf(df_store,lags = 60,ax = ax[idx,1], label = store) ax[idx,0].set_title('Autocorelation for store {}'.format(store)) ax[idx,1].set_title('Partial Autocorelation for store {}'.format(store)) plt.tight_layout() # By running this part few times we can notice that almost for all stores there is hogh corelation with following lags: # 1 14,28,42, 49 # Therefore lets use this values to create new features # But we need to preduct 48 days in future, threre fore we cannot use something lower 48",No,3,33.0 "b""# finally lets check on nan and dublicated values\n \nprint('NaN summary\\n\\n',data_prep.isna().sum()/len(data_prep)*100,'\\n')\nprint('Number of absoulute dublicates:',data_prep.duplicated().sum())\nprint('Number of Store - Date dublicates:',data_prep.duplicated(subset = ['Date','Store']).sum())""",No,3,39.0 "from pandas.plotting import scatter_matrix import seaborn corr = data_prep.corr() plt.figure(figsize=(15,15)) seaborn.heatmap(corr)",No,5,80.0 "stores = np.random.randint(train.Store.min(),train.Store.max(),2) plt.figure(figsize=(15,10)) for store in stores: plt.plot(data_prep.loc[(data_prep.Store == store) & (data_prep.year == 2013),'Date'],data_prep.loc[(data_prep.Store == store) & (data_prep.year == 2013),'Sales'],label = store) plt.legend()",No,5,75.0 "ohe_col = data_prep.select_dtypes('object').columns.tolist()+['Store','DayOfWeek','month'] num_col = data_prep.select_dtypes('float').columns.tolist()",No,2,8.0 "X = data_prep.drop(['Date','Sales','Customers','Store'],axis = 1) y = data_prep.Sales X_train,X_val = X.loc[X.year < 2015,:],X.loc[X.year == 2015,:] y_train,y_val = y[:X_train.index[-1]+1], y[X_train.index[-1]+1:]",No,3,13.0 "from sklearn.preprocessing import StandardScaler,MinMaxScaler from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder from sklearn.compose import make_column_transformer from sklearn.pipeline import make_pipeline transformer = make_column_transformer( (StandardScaler(),['CompetitionDistance', 'Customers_enc_mean']), (OneHotEncoder(),['StateHoliday', 'StoreType', 'Assortment', 'DayOfWeek', 'month']), remainder = 'passthrough' ) import xgboost as xgb regressor = xgb.XGBRegressor(n_estimators = 200, max_depth = 10 ) pipeline = make_pipeline(transformer, regressor) pipeline.fit(X_train,y_train) print('TRAIN RMSPE = ',rmspe(np.expm1(pipeline.predict(X_train)),np.expm1(y_train))) print('VAL RMSPE = ',rmspe(np.expm1(pipeline.predict(X_val)),np.expm1(y_val)))",No,3,7.0 "# We need to apply same transformation on test set as we did we train set test_prep = (test .copy() .pipe(timeseries_features) .drop(['Open','Date'],axis=1) 
) test_prep = pd.merge(test_prep,store_prep,how='left',on='Store') test_prep['is_promo'] = test_prep.apply(lambda x: 1 if x['month'] in x['PromoInterval'] else 0,axis = 1) test_prep = pd.merge(test_prep,dict_for_test,how='left',on='Store') test_prep = test_prep.drop(['PromoInterval','Store'],axis=1).reset_index(drop=True) ",No,4,32.0 "test_id = test_prep.Id test_prep.drop('Id',axis=1,inplace = True) predict = np.expm1(pipeline.predict(test_prep)) # Remember to make inverse transformation",No,5,48.0 "sub = pd.DataFrame({'Id':test_id,'Sales':predict}).sort_values('Id').reset_index(drop=True) sub.to_csv('submission.csv',index=False)",No,5,25.0 "columns = [""Date"", ""Store"", ""Promo"", ""StateHoliday"", ""SchoolHoliday""]",No,4,21.0 "df = train[columns].append(test[columns]) df.head(3)",No,4,21.0 "fld = 'SchoolHoliday' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 "fld = 'StateHoliday' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 "fld = 'Promo' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 df = df.set_index('Date'),No,5,53.0 "columns = ['SchoolHoliday', 'StateHoliday', 'Promo']",No,4,17.0 "for o in ['Before', 'After']: for p in columns: a = o+p df[a] = df[a].fillna(0).astype(int)",No,3,17.0 "bwd = df[['Store']+columns].sort_index().groupby(""Store"").rolling(7, min_periods=1).sum()'",No,4,60.0 "fwd = df[['Store']+columns].sort_index(ascending=False ).groupby(""Store"").rolling(7, min_periods=1).sum()'",No,4,60.0 "bwd.drop('Store',1,inplace=True) bwd.reset_index(inplace=True)",No,5,10.0 "fwd.drop('Store',1,inplace=True) fwd.reset_index(inplace=True)",No,5,10.0 df.reset_index(inplace=True),No,4,84.0 "df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=['', '_bw']) df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=['', '_fw'])",No,5,32.0 "df.drop(columns,1,inplace=True)",No,5,10.0 "df[""Date""] = pd.to_datetime(df.Date)",No,5,16.0 "joined = join_df(joined, df, ['Store', 'Date'])",No,5,32.0 "joined_test = join_df(joined_test, df, ['Store', 'Date'])",No,5,32.0 joined = joined[joined.Sales!=0],No,5,14.0 "joined.reset_index(inplace=True) joined_test.reset_index(inplace=True)",No,5,84.0 "pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None)",No,5,23.0 "train_df = joined test_df = joined_test",No,5,77.0 train_df.head().T,No,5,41.0 "print(test_df.shape) test_df.head()",No,3,41.0 n = len(train_df); n,No,5,77.0 "idx = np.random.permutation(range(n))[:2000] idx.sort() small_train_df = train_df.iloc[idx[:1000]] small_test_df = train_df.iloc[idx[1000:]] small_cont_vars = ['CompetitionDistance','Mean_Humidity'] small_cat_vars = ['Store','DayOfWeek','PromoInterval'] small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']] small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]",No,4,14.0 small_train_df.head(),No,5,41.0 small_test_df.head(),No,5,41.0 "categorify = Categorify(small_cat_vars, small_cont_vars) categorify(small_train_df) categorify(small_test_df, test=True)",No,5,20.0 small_train_df.PromoInterval.cat.categories,No,5,57.0 "# we convert to categories then add 1 to -1 (NaNs) to turn it to zero because you can not look up 1 in an embedding matrix 
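# To make that shift concrete: pandas codes a missing category as -1, and adding 1
# moves it to 0 so it can index row 0 of an embedding matrix (an index of -1 cannot
# be looked up). A tiny self-contained sketch on a throwaway series follows; the
# _demo name is illustrative only and not part of the original notebook.
import pandas as pd  # pandas should already be available here; repeated so the sketch stands alone
_demo = pd.Series(['a', None, 'b'], dtype='category')
print(_demo.cat.codes.tolist())        # [0, -1, 1]  -> the missing value is coded -1
print((_demo.cat.codes + 1).tolist())  # [1, 0, 2]   -> after the +1 shift, missing maps to 0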
small_train_df['PromoInterval'].cat.codes[:5]",No,5,41.0 "fill_missing = FillMissing(small_cat_vars, small_cont_vars) fill_missing(small_train_df) fill_missing(small_test_df, test=True)",No,5,17.0 "# find any missing values, create a column called ""_na"" and set it to True any time it is missing # then replace the empty value with the median of CompetitionDistance because it needs to be a continues varaiable small_train_df[small_train_df['CompetitionDistance_na'] == True]'",No,5,14.0 "len(train_df),len(test_df)",No,5,58.0 "# as seen above, create pre processers fill missing, categorify # and normalize (normalize: for any continous var subtract the mean and divide by std) procs=[FillMissing, Categorify, Normalize]",No,5,77.0 "# name your category variables, keep some continues variables like ""day"" as cat because # as a cat var it will create an embedding matrix and the different days of the month will create different behavors cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw'] # name your continues variables cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']'",No,4,14.0 "# dependant var dep_var = 'Sales' # the final df to pass in will be the cat_vars, cont_vars, dep_var, and date, date will be used to create the validation set, #it will be the same number of records at the end of the time period as the test set from kaggle df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()",No,3,14.0 "test_df['Date'].min(), test_df['Date'].max()",No,5,40.0 "cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max() cut",No,5,14.0 valid_idx = range(cut),No,4,13.0 "# finally, lets look df[dep_var].head()",No,5,41.0 "# create databunch data = (TabularList.from_df(df, path='.', cat_names=cat_vars, cont_names=cont_vars, procs=procs,) .split_by_idx(valid_idx) .label_from_df(cols=dep_var, label_cls=FloatList, log=True) .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars)) .databunch())",No,5,12.0 "max_log_y = np.log(np.max(train_df['Sales'])*1.2) y_range = torch.tensor([0, max_log_y], device=defaults.device)",No,4,21.0 "# Learner learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics=exp_rmspe)",No,4,7.0 learn.model,No,3,4.0 len(data.train_ds.cont_names),No,5,58.0 "learn.lr_find() learn.recorder.plot()",No,5,35.0 "learn.fit_one_cycle(5, 1e-3, wd=0.2)",No,5,7.0 learn.save('1'),No,5,50.0 learn.recorder.plot_losses(skip_start=10000),No,5,35.0 learn.load('1');,No,5,30.0 "learn.fit_one_cycle(5, 3e-4)",No,5,7.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os",No,5,22.0 "# Loading data directly from CatBoost from catboost.datasets import amazon train, test = amazon()",No,3,21.0 "print(""Train shape: {}, Test shape: {}"".format(train.shape, test.shape))",No,5,58.0 train.head(5),No,5,41.0 test.head(5),No,5,41.0 train.apply(lambda x: len(x.unique())),No,5,54.0 "import itertools target = ""ACTION"" col4train = [x for x in train.columns if x!=target] col1 = 'ROLE_CODE' col2 = 'ROLE_TITLE' pair = len(train.groupby([col1,col2]).size()) single = len(train.groupby([col1]).size()) print(col1, col2, pair, single)'",No,3,71.0 col4train = [x for x in col4train if x!='ROLE_TITLE'],No,5,77.0 "#linear - OHE from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(sparse=True, dtype=np.float32, handle_unknown='ignore')",Yes,4,20.0 "X = ohe.fit_transform(train[col4train]) y = train[""ACTION""].values",No,4,21.0 "from sklearn.model_selection import cross_validate model = LogisticRegression( penalty='l2', C=1.0, fit_intercept=True, random_state=432, solver = 'liblinear', max_iter = 1000, ) stats = cross_validate(model, X, y, groups=None, scoring='roc_auc', cv=5, n_jobs=2, return_train_score = True) stats = pd.DataFrame(stats) stats.describe().transpose()",No,3,4.0 "X = ohe.fit_transform(train[col4train]) y = train[""ACTION""].values X_te = ohe.transform(test[col4train]) model.fit(X,y) predictions = model.predict_proba(X_te)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,3,7.0 "# Loading data directly from CatBoost from catboost.datasets import amazon train, test = amazon() target = ""ACTION"" col4train = [x for x in train.columns if x not in [target, ""ROLE_TITLE""]] y = train[target].values",No,3,13.0 "from sklearn.model_selection import StratifiedKFold from sklearn.metrics import roc_auc_score from sklearn.ensemble import ExtraTreesClassifier #our small helper function, returns ExtraTrees instance def get_model(): params = { ""n_estimators"":300, ""n_jobs"": 3, ""random_state"":5436, } return ExtraTreesClassifier(**params)",No,3,59.0 "from sklearn.base import BaseEstimator, TransformerMixin class TargetEncoding(BaseEstimator, TransformerMixin): def __init__(self, columns_names ): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: self.learned_values[c] = (X_[[c,""__target__""]] .groupby(c)[""__target__""].mean() .reset_index()) self.dataset_mean = np.mean(y) return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) return self.transform(X)'",No,5,20.0 "skf = StratifiedKFold(n_splits=5, random_state = 5451, shuffle = True) te = TargetEncoding(columns_names=col4train) X_tr = te.fit_transform(train, y).values scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df, valid_df = X_tr[train_index], X_tr[test_index] train_y, valid_y = y[train_index], y[test_index] model = get_model() model.fit(train_df,train_y) predictions = 
model.predict_proba(valid_df)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(train_df)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncoding(columns_names=col4train) X_tr = te.fit_transform(train_df, train_y).values X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "class TargetEncodingSmoothing(BaseEstimator, TransformerMixin): def __init__(self, columns_names,k, f ): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan self.k = k # self.f = f # def smoothing_func(self, N): # return 1 / (1 + np.exp(-(N-self.k)/self.f)) def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} self.dataset_mean = np.mean(y) X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: stats = (X_[[c,""__target__""]] .groupby(c)[""__target__""]. agg(['mean', 'size'])) stats[""alpha""] = self.smoothing_func(stats[""size""]) stats[""__target__""] = (stats[""alpha""]*stats[""mean""] + (1-stats[""alpha""])*self.dataset_mean) stats = (stats .drop([x for x in stats.columns if x not in [""__target__"",c]], axis = 1) .reset_index()) self.learned_values[c] = stats self.dataset_mean = np.mean(y) return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) return self.transform(X)'",No,3,20.0 "%matplotlib inline x = np.linspace(0,100,100) plot = pd.DataFrame() te = TargetEncodingSmoothing([], 1,1) plot[""k=1|f=1""] = te.smoothing_func(x) te = TargetEncodingSmoothing([], 33,5) plot[""k=33|f=5""] = te.smoothing_func(x) te = TargetEncodingSmoothing([], 66,15) plot[""k=66|f=15""] = te.smoothing_func(x) plot.plot(figsize = (15,8))",No,5,81.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingSmoothing( columns_names= col4train, k = 3, f = 1.5 ) X_tr = te.fit_transform(train_df, train_y).values X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), 
np.std(scores) ))",Yes,1,7.0 "def get_CV_target_encoding(data, y, encoder, cv = 5): skfTE = StratifiedKFold(n_splits=cv, random_state = 545167, shuffle = True) result = [] for train_indexTE, test_indexTE in skfTE.split(data, y): encoder.fit(data.iloc[train_indexTE,:].reset_index(drop = True), y[train_indexTE]) tmp = encoder.transform(data.iloc[test_indexTE,:].reset_index(drop = True)) tmp[""index""] = test_indexTE result.append(tmp) result = pd.concat(result, ignore_index = True) result = result.sort_values('index').reset_index(drop = True).drop('index', axis = 1) return result'",Yes,2,20.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingSmoothing( columns_names= col4train, k = 3, f = 1.5 ) X_tr = get_CV_target_encoding(train_df, train_y, te, cv = 5) te.fit(train_df, train_y) X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,1,7.0 "class TargetEncodingExpandingMean(BaseEstimator, TransformerMixin): def __init__(self, columns_names): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} self.dataset_mean = np.mean(y) X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: stats = (X_[[c,""__target__""]] .groupby(c)[""__target__""] .agg(['mean', 'size'])) # stats[""__target__""] = stats[""mean""] stats = (stats .drop([x for x in stats.columns if x not in [""__target__"",c]], axis = 1) .reset_index()) self.learned_values[c] = stats return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) #Expanding mean transform X_ = X[self.columns_names].copy().reset_index(drop = True) X_[""__target__""] = y X_[""index""] = X_.index X_transformed = pd.DataFrame() for c in self.columns_names: X_shuffled = X_[[c,""__target__"", ""index""]].copy() X_shuffled = X_shuffled.sample(n = len(X_shuffled),replace=False) X_shuffled[""cnt""] = 1 X_shuffled[""cumsum""] = (X_shuffled .groupby(c,sort=False)['__target__'] .apply(lambda x : x.shift().cumsum())) X_shuffled[""cumcnt""] = (X_shuffled .groupby(c,sort=False)['cnt'] .apply(lambda x : x.shift().cumsum())) X_shuffled[""encoded""] = X_shuffled[""cumsum""] / X_shuffled[""cumcnt""] X_shuffled[""encoded""] = X_shuffled[""encoded""].fillna(self.dataset_mean) X_transformed[c] = X_shuffled.sort_values(""index"")[""encoded""].values return X_transformed'",No,4,20.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = 
TargetEncodingExpandingMean(columns_names=col4train) X_tr = te.fit_transform(train_df, train_y) X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "train[col4train] = train[col4train].values.astype(str) test[col4train] = test[col4train].values.astype(str) from itertools import combinations new_col4train = col4train for c1,c2 in combinations(col4train, 2): name = ""{}_{}"".format(c1,c2) new_col4train.append(name) train[name] = train[c1] + ""_"" + train[c2] test[name] = test[c1] + ""_"" + test[c2]",No,3,78.0 "print(train[new_col4train].shape, test[new_col4train].shape) train[new_col4train].head(5)",No,4,58.0 train[new_col4train].apply(lambda x: len(x.unique())),No,5,54.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,new_col4train].reset_index(drop = True) valid_df = train.loc[test_index,new_col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingExpandingMean(columns_names=new_col4train) X_tr = te.fit_transform(train_df, train_y) X_val = te.transform(valid_df) te2 = TargetEncodingSmoothing( columns_names= new_col4train, k = 3, f = 1.5, ) X_tr2 = get_CV_target_encoding(train_df, train_y, te2, cv = 5) te2.fit(train_df, train_y) X_val2 = te2.transform(valid_df) X_tr = pd.concat([X_tr, X_tr2], axis = 1) X_val = pd.concat([X_val, X_val2], axis = 1) model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,1,7.0 "te = TargetEncodingExpandingMean(columns_names=new_col4train) X_tr = te.fit_transform(train[new_col4train], y) X_val = te.transform(test[new_col4train]) te2 = TargetEncodingSmoothing( columns_names= new_col4train, k = 3, f = 1.5, ) X_tr2 = get_CV_target_encoding(train[new_col4train], y, te2, cv = 5) te2.fit(train[new_col4train], y) X_val2 = te2.transform(test[new_col4train]) X = pd.concat([X_tr, X_tr2], axis = 1) X_te = pd.concat([X_val, X_val2], axis = 1) model = get_model() model.fit(X,y) predictions = model.predict_proba(X_te)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,1,7.0 "import numpy as np import pandas as pd import os print(os.listdir(""../input"")) ",No,5,88.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 "id=test.iloc[:,0].values test.drop('id',axis=1)",No,5,10.0 id,No,5,53.0 "X = train.iloc[:, 1:11].values y = train.iloc[:, 0].values",No,5,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)",No,5,13.0 "from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test)",No,5,18.0 "# Fitting Random Forest Classification to the Training set from sklearn.ensemble import 
RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 99, criterion = 'entropy', random_state = 0) classifier.fit(X_train, y_train)",Yes,3,7.0 y_pred = classifier.predict(X_test),No,5,48.0 "from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred)",No,5,49.0 cm,No,5,53.0 "#for calculating accuracy (131+6090)/(131+6090+92+241)",No,5,53.0 "test.drop(['id'], axis=1, inplace = True)",No,5,10.0 test,No,5,41.0 test_pred = classifier.predict(test),No,5,48.0 test_pred,No,5,41.0 "submission = pd.DataFrame({'Id':id,'Action':test_pred})",No,5,12.0 "final_submission=submission.iloc[0:58921,:].values",No,5,14.0 final_submission,No,5,41.0 "final_submission = pd.DataFrame({'Id':final_submission[:,0],'Action':final_submission[:,-1]})",No,5,12.0 "filename = 'Amazon Employee Access .csv' final_submission.to_csv(filename,index=False) print('Saved file: ' + filename)",No,4,25.0 "import seaborn as sns import numpy as np # linear algebra import pandas as pd from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier import math from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB from xgboost import XGBClassifier",No,5,22.0 "train_file_path = '/kaggle/input/covid19-global-forecasting-week-2/train.csv' test_file_path = '/kaggle/input/covid19-global-forecasting-week-2/test.csv' train_dataset = pd.read_csv(train_file_path) test_dataset = pd.read_csv(test_file_path) ",No,5,45.0 "print(""Understanding of Train Dataset:\ \ \ "") print('Train Dataset has following states:\ ') province_state = train_dataset['Province_State'].unique() print(province_state) print('\ \ \ Train Dataset has following Country Region:\ ') country_region = train_dataset['Country_Region'].unique() print(country_region) print('\ \ \ Train Dataset has records of following dates:\ ') dates = train_dataset['Date'].unique() print(dates) #convert to mm/dd/yyyy train_dataset['Date'] = pd.to_datetime(train_dataset['Date']) print('\ Train Dataset has following Date Range:') print(pd.date_range(start=train_dataset['Date'].min(), end=train_dataset['Date'].max())) '",No,3,16.0 "print(""Understanding of Test Dataset:\ \ \ "") print('Test Dataset has following states:\ ') province_state = test_dataset['Province_State'].unique() print(province_state) print('\ \ \ Test Dataset has following Country Region:\ ') country_region = test_dataset['Country_Region'].unique() print(country_region) print('\ \ \ Test Dataset has records of following dates:\ ') dates = test_dataset['Date'].unique() print(dates) #convert to mm/dd/yyyy test_dataset['Date'] = pd.to_datetime(test_dataset['Date']) print('\ Test Dataset has following Date Range:') print(pd.date_range(start=test_dataset['Date'].min(), end=test_dataset['Date'].max())) '",No,4,57.0 "print(""Train Dataset Graphical Representation of Counrtry Region w.r.t. 
Confirmed Cases"") show_cumulatively = train_dataset.groupby(by='Country_Region')[['ConfirmedCases','Fatalities']].max().reset_index() plt.figure(figsize=(20,10)) #sns.set() sns.barplot(x='ConfirmedCases',y='Country_Region',data=show_cumulatively[show_cumulatively['ConfirmedCases'] != 0].sort_values(by='ConfirmedCases',ascending=False).head(50)) '",No,4,81.0 " print(""Train Dataset Graphical Representation of Counrtry Region w.r.t. Fatalities"") plt.figure(figsize=(20,10)) sns.barplot(x='Fatalities',y='Country_Region',data=show_cumulatively[show_cumulatively['Fatalities'] != 0].sort_values(by='Fatalities',ascending=False).head(50))'",No,4,81.0 "print('Those Country Regions of Train Dataset whose Confirmed Cases have Fatalities') non_fatalities_train_df = train_dataset[train_dataset['Fatalities'] != 0] non_fatalities_train_df[['Country_Region','Date','ConfirmedCases','Fatalities']]",No,4,71.0 "print('Those Country Regions whose Confirmed Cases have not Fatalities') non_fatalities_train_df = train_dataset[train_dataset['Fatalities'] == 0] non_fatalities_train_df[['Country_Region','Date','ConfirmedCases','Fatalities']]",No,3,71.0 "b""print('The value count of Country Regions of Non-Null Province States in Train Dataset')\nprint(train_dataset[~train_dataset['Province_State'].isnull()]['Country_Region'].value_counts())\n\n\nprint('\\n\\n\\nThe value count of Country Regions of Null Province States in Train Dataset')\nprint(train_dataset[train_dataset['Province_State'].isnull()]['Country_Region'].value_counts())""",No,5,39.0 "b""print('The value count of Country Regions of Non-Null Province States in Test Dataset')\nprint(test_dataset[~test_dataset['Province_State'].isnull()]['Country_Region'].value_counts())\n\n\nprint('\\n\\n\\nThe value count of Country Regions of Null Province States in Test Dataset')\nprint(test_dataset[test_dataset['Province_State'].isnull()]['Country_Region'].value_counts())""",No,5,39.0 "print(""Train dataset before pre-processing:\ "") print(train_dataset.head()) train_dataset = train_dataset.fillna('Enpyty_value') print(""\ \ \ Train dataset after pre-processing:\ "") print(train_dataset.head())'",No,4,17.0 "print(""Test dataset before pre-processing:\ "") print(test_dataset.head()) test_dataset = test_dataset.fillna('Enpyty_value') print(""\ \ \ Test dataset after pre-processing:\ "") print(test_dataset.head())'",No,4,17.0 "labelEncoder = LabelEncoder() train_dataset['Date'] = pd.to_datetime(train_dataset['Date']).dt.strftime(""%m%d"").astype(int) train_dataset['Date'] -= 122 test_dataset['Date'] = pd.to_datetime(test_dataset['Date']).dt.strftime(""%m%d"").astype(int) test_dataset['Date'] -= 122 train_dataset.Province_State = labelEncoder.fit_transform(train_dataset.Province_State) train_dataset.Country_Region = labelEncoder.fit_transform(train_dataset.Country_Region) test_dataset.Province_State = labelEncoder.fit_transform(test_dataset.Province_State) test_dataset.Country_Region = labelEncoder.fit_transform(test_dataset.Country_Region) print('\ \ \ Train Dataset After Encoding') print(train_dataset.head(5)) print('\ \ \ Test Dataset After Encoding') print(test_dataset.head(5)) '",No,3,8.0 "#We don't need to convert it into vector because it is alreayd in vector form. 
See following print(train_dataset.head()) print(test_dataset.head()) ",No,4,84.0 "X = train_dataset[['Province_State','Country_Region','Date']] y = train_dataset[['ConfirmedCases','Fatalities']] classifier = BinaryRelevance(GaussianNB()) # train classifier.fit(X, y[['Fatalities']]) # predict predictions_fatalities = classifier.predict(test_dataset[['Province_State','Country_Region','Date']]) # train classifier.fit(X, y[['ConfirmedCases']]) # predict predictions_confirmed_cases = classifier.predict(test_dataset[['Province_State','Country_Region','Date']]) ",No,3,7.0 "output_confirmed_cases_df = pd.DataFrame(data=predictions_confirmed_cases.toarray()) output_fatalities_df = pd.DataFrame(data=predictions_fatalities.toarray()) output_confirmed_cases_df = output_confirmed_cases_df.rename(columns={0: ""ConfirmedCases""}) output_fatalities_df = output_fatalities_df.rename(columns={0: ""Fatalities""})",No,5,55.0 test_dataset.ForecastId,No,4,84.0 "result.to_csv('submission.csv', index=False) ",No,5,25.0 "filepath= '/kaggle/input/amazon-employee-access-challenge/train.csv' traindata= pd.read_csv(filepath) filepath2= '/kaggle/input/amazon-employee-access-challenge/test.csv' testdata= pd.read_csv(filepath2) testdatacopy=testdata traindata.head()",No,4,45.0 "##Thus we see that there are no null values sns.heatmap(traindata.isnull(),yticklabels=False,cbar=False,cmap='viridis')",No,5,80.0 "#Now we plot the number of people who were granted access sns.set_style('whitegrid') sns.countplot(x='ACTION',data=traindata,palette='RdBu_r')",No,5,33.0 "y=traindata['ACTION'] x=traindata.drop('ACTION',axis=1) #Splitting training and testing data x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",No,4,21.0 "#Logistic Regression LogisticRegressor = LogisticRegression(max_iter=10000) LogisticRegressor.fit(x_train, y_train) y_predicted = LogisticRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted)) print('accuracy score:') print(accuracy_score(y_test,y_predicted)) '",Yes,2,7.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x_train,y_train); y_predicted_r = rf.predict(x_test) mse = mean_squared_error(y_test, y_predicted_r) r = r2_score(y_test, y_predicted_r) mae = mean_absolute_error(y_test,y_predicted_r) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted_r)) print('accuracy score:') print(accuracy_score(y_test,y_predicted_r)) '",Yes,2,7.0 "# Decision Tree - CART regressor = DecisionTreeRegressor(random_state = 0) regressor.fit(x_train, y_train) y_predicted_d = regressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted_d) r = r2_score(y_test, y_predicted_d) mae = mean_absolute_error(y_test,y_predicted_d) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted_d)) print('accuracy score:') print(accuracy_score(y_test,y_predicted_d)) '",Yes,2,7.0 "#XGBClassifier xgboost = XGBClassifier(n_estimators=1000) xgboost.fit(x_train,y_train) xg_pred = xgboost.predict(x_test) msee21 = mean_squared_error(y_test, xg_pred) ra21 = r2_score(y_test, xg_pred) maee21 = mean_absolute_error(y_test,xg_pred) print(""Mean Squared Error:"",msee21) 
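# (editor's aside - a minimal sketch, not part of the original notebook: for a binary
# target like ACTION, a ranking metric such as ROC AUC is usually more informative than
# MSE/MAE; it reuses the fitted 'xgboost' model and the x_test/y_test split defined above)
from sklearn.metrics import roc_auc_score
xg_proba = xgboost.predict_proba(x_test)[:, 1]  # predicted probability of the positive class
print('ROC AUC:', roc_auc_score(y_test, xg_proba))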
print(""R score:"",ra21) print(""Mean Absolute Error:"",maee21) print('f1 score:') print(f1_score(y_test,xg_pred)) print('accuracy score:') print(accuracy_score(y_test,xg_pred))'",No,2,7.0 "#SVM svclassifier = SVC(kernel='linear') svclassifier.fit(x_train, y_train) y_pred2 = svclassifier.predict(x_test) mseew = mean_squared_error(y_test, y_pred2) ra = r2_score(y_test, y_pred2) maeew = mean_absolute_error(y_test,y_pred2) print(""Mean Squared Error:"",mseew) print(""R score:"",ra) print(""Mean Absolute Error:"",maeew) print('f1 score:') print(f1_score(y_test,y_pred2)) print('accuracy score:') print(accuracy_score(y_test,y_pred2))'",Yes,2,7.0 "#Naive Bayes gnb = GaussianNB() y_preed = gnb.fit(x_train, y_train).predict(x_test) ms = mean_squared_error(y_test, y_preed) rae = r2_score(y_test, y_preed) mew = mean_absolute_error(y_test,y_preed) print(""Mean Squared Error:"",ms) print(""R score:"",rae) print(""Mean Absolute Error:"",mew) print('f1 score:') print(f1_score(y_test,y_preed)) print('accuracy score:') print(accuracy_score(y_test,y_preed))'",Yes,2,7.0 "#KNN math.sqrt(len(y_test)) #Therefore n neighbors=99 ",No,5,53.0 "#KNN classify= KNeighborsClassifier (n_neighbors=99, p =2, metric= 'euclidean') classify.fit(x_train,y_train) ypred1=classify.predict(x_test) msee = mean_squared_error(y_test, ypred1) r = r2_score(y_test, y_predicted_d) maee = mean_absolute_error(y_test,ypred1) print(""Mean Squared Error:"",msee) print(""R score:"",r) print(""Mean Absolute Error:"",maee) print('f1 score:') print(f1_score(y_test,ypred1)) print('accuracy score:') print(accuracy_score(y_test,ypred1))'",No,2,7.0 "testdata=testdata.drop('id',axis=1)",No,5,10.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x,y) Prediction = rf.predict(testdata) ",Yes,3,7.0 "predictionlist=Prediction.tolist() Passengerid=testdatacopy['id'].tolist() output=pd.DataFrame(list(zip(Passengerid, predictionlist)), columns=['id','Action']) output.head() output.to_csv('my_submission(AmazonEmployeeAccess).csv', index=False) ",No,3,25.0 "plt.figure(figsize=(25,12)) mask = np.zeros_like(df.corr()) mask[np.triu_indices_from(mask)] = True sns.heatmap(df.corr(), cmap='coolwarm', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5) plt.show() ",No,5,80.0 df_train = df[df['part']== 'train'],No,5,14.0 "cols_train = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Promo','PromoOpen', # 'shift_sales','shift_t7_sales', 'shift_t30_sales','shift_customer','shift_t7_customer', 'shift_t30_customer', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'Month', 'Year', 'Day','IsPromoMonth', # 'Sales_DayOfWeek', 'Sales_Promo', 'Sales_Promo2', 'Sales_Month', 'Sales_Year', 'Sales_Day', 'Sales_StateHoliday', 'Sales_StoreType', # 'Sales_Assortment', 'Customers_DayOfWeek', #'isBeforeCompetition', 'Customers_Promo', 'Customers_Promo2', 'Customers_Month', # 'Customers_Year', 'Customers_Day', 'Customers_StateHoliday','Customers_StoreType', 'Customers_Assortment', # 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Sales_StrType_DayOfWeek','SalesPerCustomer_StrType_Assortment', 'Customers_StrType_Promo2', 'Sales_StrType_Promo', 'Sales_StrType_Promo2', 'Sales_StrType_Month','Customers_StrType_StateHoliday', 'Sales_StrType_Year', 'Sales_StrType_Day', 'Sales_StrType_StateHoliday','SalesPerCustomer_StrType_Promo2', 'Sales_StrType_Assortment', 'Customers_StrType_DayOfWeek', 'Customers_StrType_Promo', 'Customers_StrType_Month', 'Customers_StrType_Year', 'Customers_StrType_Day', 'SalesPerCustomer_StrType_StateHoliday', 
'Customers_StrType_Assortment', 'SalesPerCustomer_StrType_DayOfWeek', 'SalesPerCustomer_StrType_Promo', 'SalesPerCustomer_StrType_Month', 'SalesPerCustomer_StrType_Year', 'SalesPerCustomer_StrType_Day', # 'Sales_StrType_Quarter', 'Customers_StrType_Quarter', 'Sales_Year_Quarter','Customers_Year_Quarter', 'Customers_Quarter','Sales_Quarter', # 'Quarter' ] ",No,5,77.0 "params = {""objective"": ""reg:linear"", # for linear regression ""booster"" : ""gbtree"", # use tree based models ""eta"": 0.02, # learning rate ""max_depth"": 11, # maximum depth of a tree ""subsample"": 0.9, # Subsample ratio of the training instances ""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree ""silent"": 1, # silent mode ""seed"": 10, # Random number seed 'tree_method': 'gpu_hist', } num_boost_round = 800 def rmspe_xg(yhat, y): y = np.expm1(y.get_label()) yhat = np.expm1(yhat) return ""rmspe"", rmspe(y,yhat) import xgboost as xg",No,4,23.0 "tmp= pd.pivot_table(data, ['Date'], ""Store"", aggfunc=""count"").reset_index().sort_values('Date', ascending=False).head(300) top_stores = tmp[""Store""].values'",No,3,8.0 "# from sklearn.manifold import TSNE # from sklearn.preprocessing import StandardScaler def process(x, cols=None, all_stores=False): x.sort_values(""Date"",inplace=True) # scaler = StandardScaler() if cols is None: cols = x.columns x = x.fillna(x.median()) # for i in x.columns[(x.dtypes.values == np.dtype('float64'))]: # if i not in ['Id', 'Promo2SinceWeek', 'Promo2SinceYear','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Sales', # 'Quarter', 'WeekOfYear', 'PromoOpen', 'Promo2SinceWeek', 'Promo2SinceYear']: # x[i] = np.round(np.log1p(x[i]),2) x_train = x[x[""Date""]<=""2015-06-12""][cols].copy() x_test = x[x[""Date""]>""2015-06-12""][cols].copy() store_test = x_test['Store'].unique().tolist() x_train = x_train[(x_train['Store'].isin(store_test))] y_train = np.log(x_train['Sales']) if all_stores: rmv = ['Date', 'Sales'] else: rmv = ['Date', 'Sales', 'Store'] x_train= x_train.drop(rmv, 1) x_train = pd.get_dummies(x_train) x_train_arr = x_train.values x_test_arr = pd.get_dummies(x_test.drop(rmv, 1)).values #scaler.fit(x_train_arr) #x_train_arr = scaler.transform(x_train_arr) #x_test_arr = scaler.transform(x_test_arr) #reduc = TSNE(n_components=2) #reduc.fit(x_train_arr) #x_train_arr = reduc.transform(x_train_arr) #x_test_arr = reduc.transform(x_test_arr) return x_train.columns, x_train_arr, y_train, x_test, x_test_arr'",Yes,4,1.0 "fig, ax = plt.subplots(5, 2, figsize=(25, 15)) X = df_train[df_train['Store'].isin(top_stores)] .copy() X_train_col, X_train_arr, Y_train, X_test, X_test_arr = process(X, cols_train, True) dtrain = xgb.DMatrix(X_train_arr, Y_train) estimator = xgb.train(params, dtrain, num_boost_round, feval=rmspe_xg,) Y_pred = estimator.predict(xgb.DMatrix(X_test_arr)) X_test[""Pred""] = np.exp(Y_pred) scores = np.round(mean_squared_error(X_test['Sales'], X_test[""Pred""])) cpt = 0 for i in top_stores[:5]: x_train = df_train[df_train[""Store""]==i] x_test = X_test[X_test[""Store""]==i] ax[cpt, 0].plot(x_train[""Date""], x_train[""Sales""]) ax[cpt, 0].plot(x_test[""Date""], x_test[""Pred""]) ax[cpt, 0].set_title(i) ax[cpt, 1].scatter(x_test[""Date""].values, x_test['Sales'].values - x_test[""Pred""].values) ax[cpt, 1].plot(x_test[""Date""], [0 for _ in range(len(x_test))]) ax[cpt, 1].set_title( np.round(mean_squared_error(X_test['Sales'], X_test[""Pred""]))) #feat_importances = pd.Series(reg.feature_importances_, index=X_train_col) 
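# (editor's note, sketch only: 'reg' does not appear in this cell - the model trained here
#  is the Booster returned by xgb.train, which reports importances via get_score rather
#  than feature_importances_, e.g.)
# imp = pd.Series(estimator.get_score(importance_type='gain'))
# imp.nlargest(10).sort_values().plot(kind='barh', ax=ax[cpt, 2])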
#feat_importances.nlargest(10).sort_values(ascending = True).plot(kind='barh', ax=ax[cpt, 2]) # ax[cpt, 2].set_xlabel('importance') cpt+=1 plt.tight_layout() print (np.mean(scores))'",Yes,3,1.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.image as mpimg import seaborn as sns %matplotlib inline np.random.seed(2) from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix import itertools from keras.utils.np_utils import to_categorical # convert to one-hot-encoding from keras.models import Sequential, save_model, load_model from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D from keras.optimizers import RMSprop, Adam from keras.preprocessing.image import ImageDataGenerator from keras.callbacks import ReduceLROnPlateau, TensorBoard, ModelCheckpoint sns.set(style='white', context='notebook', palette='deep')",No,5,23.0 "### load data train = pd.read_csv('../input/training/training.csv') test = pd.read_csv('../input/test/test.csv') sample = pd.read_csv('../input/SampleSubmission.csv') look_id = pd.read_csv('../input/IdLookupTable.csv')",No,5,45.0 train.tail().T,No,5,41.0 "train.fillna(method='ffill', inplace=True) train.tail().T",No,4,17.0 train.isnull().any().describe(),No,4,40.0 "Img = [] for i in range(7049): img = train[""Image""][i].split(' ') img = ['0' if x=='' else x for x in img] Img.append(img)'",No,4,17.0 "PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv') df_train.head() df_test.head() df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_train.info() df_test.info() y1_Train = df_train.iloc[:, -2] y1_Train.head() y2_Train = df_train.iloc[:, -1] y2_Train.head() EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state '",No,3,45.0 "Y_train = train.drop('Image', axis=1) Y_train = Y_train.values Y_train = np.array(Y_train, dtype='float') Y_train.shape, X_train.shape",No,4,21.0 "# keras CNN # model = Sequential() model.add(Conv2D(filters=32, kernel_size=(5,5), padding = 'same', activation = 'relu', input_shape = (96,96,1))) model.add(MaxPool2D(pool_size = (2,2))) model.add(Dropout(0.25)) model.add(Conv2D(filters=32, kernel_size=(3,3), padding = 'same', activation = 'relu')) model.add(MaxPool2D(pool_size = (2,2))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(256, activation = 'relu')) model.add(Dropout(0.1)) model.add(Dense(30))",No,3,4.0 "optimizer = RMSprop(lr = 0.001, epsilon = 1e-8) optimizer1 =Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) model.compile(optimizer1, loss = ""mse"", metrics = [""accuracy""] )",No,3,4.0 " learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5, min_lr=0.00001) tensorboard = TensorBoard(log_dir = './output') modelcheckpoint = ModelCheckpoint(filepath='./optimized_model.h5', monitor=""val_loss"", save_best_only=True, mode=""min"") callback_list = [learning_rate_reduction, tensorboard, modelcheckpoint]'",No,4,28.0 "model1 = 
Sequential([Flatten(input_shape=(96,96,1)), Dense(128, activation=""relu""), Dropout(0.1), Dense(64, activation=""relu""), Dense(30) ]) model1.compile(optimizer='adam', loss='mse', metrics=['mae','accuracy'])'",No,3,4.0 "batch_size =100 epochs = 50 history = model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, callbacks=callback_list, validation_split=0.1, verbose = 2)",No,4,7.0 "# Plot the loss and accuracy curves for training and validation fig, ax = plt.subplots(2,1) ax[0].plot(history.history['loss'], color='b', label=""Training loss"") ax[0].plot(history.history['val_loss'], color='r', label=""validation loss"",axes =ax[0]) legend = ax[0].legend(loc='best', shadow=True) ax[1].plot(history.history['acc'], color='b', label=""Training accuracy"") ax[1].plot(history.history['val_acc'], color='r',label=""Validation accuracy"") legend = ax[1].legend(loc='best', shadow=True)'",No,5,35.0 "#preparing test data timag = [] for i in range(0,1783): timg = test['Image'][i].split(' ') timg = ['0' if x == '' else x for x in timg] timag.append(timg)",No,3,14.0 "X_test = np.array(timag,dtype = 'float') X_test = X_test/255 X_test = X_test.reshape(-1,96,96,1) X_test.shape",No,4,21.0 "opt_model = load_model('./optimized_model.h5') ",No,5,30.0 "pred = model.predict(X_test, batch_size = 100) pred.shape",No,4,48.0 "feature = [] for f in list( look_id['FeatureName']): feature.append(lookid_list.index(f))",No,2,8.0 "rowid = pd.Series(rowid,name = 'RowId') loc = pd.Series(preded,name = 'Location') submission = pd.concat([rowid,loc],axis = 1) submission.to_csv('face_key_detection_submission.csv',index = False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import gc import matplotlib.pyplot as plt from PIL import Image # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,22.0 df = pd.read_csv('../input/training/training.csv'),No,5,45.0 "print(df.isnull().any().value_counts(), df.shape) df.dropna(inplace=True) #df.fillna(method = 'ffill',inplace = True) #df.reset_index(drop = True, inplace = True) print(df.isnull().any().value_counts(), df.shape)",No,3,17.0 "df = df.sample(frac=1) img_data = df['Image'].values df.drop('Image', inplace=True, axis=1)",No,4,10.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from keras.models import Sequential from keras.layers import Dense,Flatten,Dropout,Conv2D,MaxPooling2D import tensorflow as tf import matplotlib.pyplot as plt from PIL import Image # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 !unzip ../input/facial-keypoints-detection/training.zip -d train,No,5,44.0 !unzip ../input/facial-keypoints-detection/test.zip -d test,No,5,84.0 "train = pd.read_csv(""../working/train/training.csv"")",No,5,45.0 "test = pd.read_csv(""../working/test/test.csv"")",No,5,45.0 print(test),No,5,41.0 train.head().T,No,5,41.0 "train.fillna(method='ffill',inplace=True)",No,5,17.0 print(train),No,5,41.0 "len(train[""Image""][4].split(' '))'",No,3,40.0 "images = np.ndarray((7049,9216)) for i in range(7049): img = np.array(train[""Image""][i].split(' ')) img = ['0' if x == '' else x for x in img] images[i,:] = img'",No,3,14.0 "Y_test = np.ndarray((1783,9216)) for i in range(1783): img = np.array(test[""Image""][i].split(' ')) img = ['0' if x == '' else x for x in img] Y_test[i,:] = img'",No,4,17.0 "images = images.reshape(-1,96,96,1)",No,5,84.0 "Y_test = Y_test.reshape(-1,96,96,1)",No,5,84.0 images.shape,No,5,58.0 "plt.imshow(images[34].reshape(96,96),cmap='gray')",No,5,84.0 "train.drop('Image',axis=1)",No,5,10.0 "Y_train = np.array(train.drop(""Image"",axis=1),dtype='float')'",No,5,21.0 print(Y_train.shape),No,5,58.0 "model = Sequential() model.add(Conv2D(32,(3,3),input_shape=(96,96,1),padding = 'SAME',activation='relu')) model.add(Conv2D(32,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(64,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(64,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Flatten()) model.add(Dense(units=512,activation='relu')) model.add(Dense(units=30)) model.summary()",No,3,4.0 "model.compile(loss='mean_squared_error',optimizer='adam',metrics=['mae'])",No,3,4.0 "model.fit(images,Y_train,epochs=10,batch_size=256,validation_split=0.2)",No,5,7.0 pred = model.predict(Y_test),No,5,48.0 "lookid_data = 
pd.read_csv(""/kaggle/input/facial-keypoints-detection/IdLookupTable.csv"")",No,5,45.0 "lookid_list = list(lookid_data['FeatureName']) imageID = list(lookid_data['ImageId']-1) pre_list = list(pred) rowid = lookid_data['RowId'] rowid=list(rowid) feature = [] for f in list(lookid_data['FeatureName']): feature.append(lookid_list.index(f)) preded = [] for x,y in zip(imageID,feature): preded.append(pre_list[x][y]) rowid = pd.Series(rowid,name = 'RowId') loc = pd.Series(preded,name = 'Location') submission = pd.concat([rowid,loc],axis = 1) submission.to_csv('face_key_detection_submission.csv',index = False)",Yes,4,25.0 "train_file = 'training.csv' test_file = 'test.csv' lookup_file = '../input/facial-keypoints-detection/IdLookupTable.csv' train = pd.read_csv(train_file) test = pd.read_csv(test_file) lookup = pd.read_csv(lookup_file) ",No,4,45.0 "import tensorflow as tf import numpy as np import pandas as pd from tensorflow import keras from keras.preprocessing.image import ImageDataGenerator import matplotlib as mpl import matplotlib.pyplot as plt !pip install py7zr from keras.preprocessing.image import load_img,img_to_array from py7zr import unpack_7zarchive import shutil import os shutil.register_unpack_format('7zip', ['.7z'], unpack_7zarchive) ",No,4,87.0 "shutil.unpack_archive('/kaggle/input/cifar-10/train.7z', '/kaggle/working')",No,4,73.0 " train_dir = os.listdir(""./train""); train_dir_len = len(train_dir) print("".\\\\train:\\t"",train_dir_len) print(""files:\\t\\t"",train_dir[:3])'",No,5,88.0 "train_labels = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv',dtype=str) train_images = pd.DataFrame(columns = ['id','label','path'],dtype=str) test_labels = pd.read_csv('/kaggle/input/cifar-10/sampleSubmission.csv') train_labels.info()",No,4,45.0 "path_base = '/kaggle/working/train/' for index in range(0,train_dir_len): path = path_base + str(index+1)+'.png' if os.path.exists(path): train_images = train_images.append([{ 'id': str(train_labels['id'].iloc[index]),'path': path, 'label':train_labels['label'].iloc[index]}]) train_images.head(2)",No,3,41.0 train_images.head(2),No,5,41.0 "display_groupby = train_images.groupby(['label']).count() display_groupby.head(10)",No,4,60.0 "class_names = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck'] for name in class_names: index = class_names.index(name) train_images.loc[train_images.label==name,'label'] = str(index) display_groupby = train_images.groupby(['label']).count() display_groupby.head(10)",No,4,60.0 "path_base = '/kaggle/working/train' batch_size = 64 train_data_generator = ImageDataGenerator( rescale=1./255., validation_split=0.2, horizontal_flip=True ) train_generator = train_data_generator.flow_from_dataframe(dataframe=train_images, directory=""./train/"", x_col=""path"", y_col=""label"", subset=""training"", batch_size=batch_size, shuffle=True, target_size=(32,32), class_mode=""categorical"")'",Yes,4,31.0 num_classes = 10,No,5,77.0 "validation_generator = train_data_generator.flow_from_dataframe(dataframe=train_images, directory=""./train/"", x_col=""path"", y_col=""label"", subset=""validation"", batch_size=batch_size, shuffle=True, target_size=(32,32), class_mode=""categorical"")",No,5,84.0 "b""train_size = len(train_generator.filenames)\nvalidation_size = len(validation_generator.filenames)\nprint('validation_size:\\t',validation_size)\nprint('train_size:\\t\\t',train_size)""",No,5,58.0 "index = 0 fig = plt.figure(figsize = (16,10)) for item in train_images.values[:20]: index += 1 plt.subplot(5, 5, 
index) test_path = item[2] test_image = load_img(test_path, target_size=(32,32)) plt.imshow(test_image) plt.colorbar() plt.grid(False) plt.axis(""off"") plt.title(class_names[int(item[1])]) plt.show()",No,5,84.0 "import warnings warnings.filterwarnings(""ignore"") import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import cross_val_score",No,4,22.0 "# Import dataset df = pd.read_csv('../input/loan-default-prediction/train_v2.csv.zip') df.head()",No,4,45.0 "# Check duplication in dataframe df[df.duplicated()].shape",No,5,38.0 "# The number of each data type in the dataframe df.dtypes.value_counts()",No,5,72.0 "# Loss Distribution fig , ax = plt.subplots() plt.hist(df['loss'], bins = 20, range=(0,100)) ax.set_ylim([0,3000]) plt.show()",No,5,33.0 "# Calculate percent of missing in each row df['num_missing'] = df.isnull().sum(axis = 1)/df.shape[1] # Drop row that percent of missing more than 20% missing_row = df[df['num_missing'] > 0.20].index df.drop(df.index[missing_row], inplace = True) df.shape",No,4,17.0 "# Drop id and num_missing collumn df.drop(columns = ['id','num_missing'], inplace = True)",No,5,10.0 "# Calculate percent of missing in each column col_pct_miss = [] for col in df.columns: percent_miss = np.mean(df[col].isnull())*100 if percent_miss > 0: col_pct_miss.append([col, percent_miss]) col_pct_miss_df = pd.DataFrame(col_pct_miss, columns = ['column_name','% of Missing']).sort_values(by = '% of Missing', ascending = False) col_pct_miss_df",No,4,17.0 "# Impute missing value in numeric columns with median numeric_cols = df.select_dtypes(include=['number']).columns.values for col in numeric_cols: if col in list(col_pct_miss_df.column_name) : med = df[col].median() df[col] = df[col].fillna(med)",No,5,17.0 "# Impute missing value in categorical columns with mode not_numeric_cols = df.select_dtypes(exclude=['number']).columns.values for col in not_numeric_cols: if col in list(col_pct_miss_df.column_name): mode = df[col].mode()[0] df[col] = df[col].fillna(mode)",No,5,17.0 "# Check missing value df.isnull().sum().value_counts()",No,5,39.0 "# Drop Highly Corelated Columns # Create correlation matrix corr_matrix = df.corr().abs() # Select upper triangle of correlation matrix upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) # Find index of feature columns with correlation greater than 0.95 to_drop = [column for column in upper.columns if any(upper[column] > 0.95)] df.drop(columns = to_drop, inplace = True)",No,5,10.0 "#Drop Repetitive Columns num_rows = df.shape[0] rep_cols = [] for col in df.loc[:, df.columns != 'loss'].columns : cnts = df[col].value_counts() top_pct = (cnts/num_rows).iloc[0] if top_pct > 0.80: rep_cols.append([col,top_pct]) rep_col_df = pd.DataFrame(rep_cols, columns = ['column_name','% top repetitve value']).sort_values(by = '% top repetitve value', ascending = False).reset_index(drop=True) rep_col_df df.shape",No,3,10.0 "cat_cols = df.select_dtypes(exclude=['number']).columns.values drop_cols = [] keep_cols = [] for col in cat_cols: if df[col].value_counts().count() > 20000 : print('column {} has {} categories > drop'.format(col,df[col].value_counts().count())) drop_cols.append(col) else : print('column {} has {} categories > keep'.format(col,df[col].value_counts().count())) keep_cols.append(col)",No,4,10.0 "# Binary Encoding import category_encoders as ce encoder = ce.BinaryEncoder(cols = keep_cols) bi_enc_df = encoder.fit_transform(df[keep_cols]) bi_col_name = 
bi_enc_df.columns bi_enc_df.head() #Add Binary Encding to dataframe and drop all categorical columns df = pd.concat([df,bi_enc_df],axis = 1) df.head()",No,3,20.0 "# Add a 'loan_status' collumn which 1 represents default loan and 0 represents not default loan. df['loan_status'] = np.where(df['loss'] > 0, 1, 0) df.head()",No,5,8.0 "# After generate a visualization from loan_status in dataframe. # We found that the data is imbalance. ax = sns.countplot(x = 'loan_status', data=df) plt.show() df['loan_status'].value_counts()",No,5,33.0 "from sklearn.model_selection import train_test_split X = resample_df.drop(columns = ['loss','loan_status']) Y = resample_df['loss'] X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2, random_state = 1234, stratify = resample_df['loan_status']) print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))",No,4,21.0 "from sklearn.feature_selection import SelectPercentile , SelectKBest, f_regression , f_classif",No,5,22.0 "#Select top 170 important numerical columns with filter method X_train_num = X_train.drop(columns = bi_col_name) selector = SelectKBest(score_func = f_regression, k = 170) selector.fit(X_train_num,Y_train) select_cols = selector.get_support(indices = True) select_num_cols = X_train_num.iloc[:,select_cols] select_num_col_name = select_num_cols.columns select_num_cols.head()",No,5,86.0 "#Select top 150 important numerical columns with RFE from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression selector = RFE(LogisticRegression(), n_features_to_select=150, step=1, verbose = 2) selector = selector.fit(select_num_cols, Y_train) select_cols = selector.get_support(indices = True) select_cols_df = select_num_cols.iloc[:,select_cols] best_X_col_name = select_cols_df.columns select_cols_df.head()",No,5,86.0 "# Select top 5 important categorical columns with filter method from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import LabelEncoder X_train_cat = X_train.select_dtypes(exclude = 'number').copy() # Create encoder le = LabelEncoder() X_train_cat = X_train_cat.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand') # Prepare input data oe = OrdinalEncoder() oe.fit(X_train_cat) X_train_cat_enc = oe.transform(X_train_cat) selector = SelectKBest(score_func = f_classif , k=5) selector.fit(X_train_cat_enc,Y_train) select_cols = selector.get_support(indices = True) select_cat_cols = X_train_cat.iloc[:,select_cols] select_cat_col_name = select_cat_cols.columns select_cat_cols.head()",No,3,86.0 "#Combine categorical and non-categorical dataframe together def filter_x_df(x): df = x.copy() all_filter_col = [] for keep in select_cat_col_name[select_cat_col_name.isin(keep_cols)]: filter_col = [col for col in df.columns if col.startswith(str(keep))] for col in filter_col : if col not in keep_cols: all_filter_col.append(col) drop_cat_df = df.drop(columns = cat_cols) new_df = pd.concat([drop_cat_df[best_X_col_name],drop_cat_df[all_filter_col]],axis = 1) return new_df",No,5,11.0 filter_X_train.head(),No,5,41.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(filter_X_train) X_train_scal = scaler.fit_transform(filter_X_train) X_test_scal = scaler.fit_transform(filter_X_test)",No,5,18.0 "from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier() neigh.fit(X_train_scal, Y_train) Knn_y_pred_train = neigh.predict(X_train_scal) Knn_y_pred_test = 
neigh.predict(X_test_scal) scores_kn = cross_val_score(estimator = neigh, y = Y_train, X = X_train_scal, cv=5) print('Cross Validation Score:', np.mean(scores_kn))",Yes,4,7.0 "from sklearn.linear_model import LogisticRegression logisticRegr = LogisticRegression() logisticRegr = logisticRegr.fit(X_train_scal, Y_train) Lr_y_pred_train = logisticRegr.predict(X_train_scal) Lr_y_pred_test = logisticRegr.predict(X_test_scal) scores_lr = cross_val_score(estimator = logisticRegr, y = Y_train, X = X_train_scal, cv=5) print('Cross Validation Score:', np.mean(scores_lr))",Yes,4,7.0 "from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(max_depth = 70) rf.fit(filter_X_train, Y_train) rf_y_pred_train = rf.predict(filter_X_train) rf_y_pred_test = rf.predict(filter_X_test) scores_rf = cross_val_score(estimator = rf, y = Y_train, X = filter_X_train, cv=5) print('Cross Validation Score:', np.mean(scores_rf))",Yes,4,7.0 "from xgboost import XGBClassifier xgb = XGBClassifier(gamma=0, learning_rate=0.1, max_depth=100, n_estimators=100) xgb.fit(filter_X_train,Y_train) xgb_y_pred_train = xgb.predict(filter_X_train) xgb_y_pred_test = xgb.predict(filter_X_test) scores_xg = cross_val_score(estimator = rf, y = Y_train, X = filter_X_train, cv=5) print('Cross Validation Score:', np.mean(scores_xg))",Yes,4,7.0 "# Split Train Set & Test Set from sklearn.model_selection import train_test_split Y = X_sm['loss'] X = X_sm.drop(columns = 'loss') X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2, random_state = 1234, stratify = Y_sm) print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))",No,4,13.0 "# Select top 170 important numerical columns with filter method X_train_num = X_train.drop(columns = bi_col_name) selector = SelectKBest(score_func = f_regression, k = 170) selector.fit(X_train_num,Y_train) select_cols_sm = selector.get_support(indices = True) select_num_cols_sm = X_train_num.iloc[:,select_cols_sm] select_num_col_name = select_num_cols_sm.columns select_num_cols_sm.head()",No,5,86.0 "# Select top 150 important numerical columns with RFE from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression selector = RFE(LogisticRegression(), n_features_to_select=150, step=1, verbose = 2) selector = selector.fit(select_num_cols_sm, Y_train) select_cols_sm = selector.get_support(indices = True) select_cols_df_sm = select_num_cols_sm.iloc[:,select_cols_sm] best_X_col_name_sm = select_cols_df_sm.columns select_cols_df_sm.head()",No,5,86.0 "# Combine categorical and non-categorical dataframe together def filter_x_df_sm(x): df = x.copy() all_filter_col = [] for keep in select_cat_col_name[select_cat_col_name.isin(keep_cols)]: filter_col = [col for col in df.columns if col.startswith(str(keep))] for col in filter_col : if col not in keep_cols: all_filter_col.append(col) new_df = pd.concat([df[best_X_col_name_sm],df[all_filter_col]],axis = 1) return new_df",No,5,11.0 "#Standardize scaler = StandardScaler() scaler.fit(filter_X_train_sm) X_train_scal_sm = scaler.fit_transform(filter_X_train_sm) X_test_scal_sm = scaler.fit_transform(filter_X_test_sm)",No,5,18.0 "# Train Logistic Regression model logisticRegr_sm = LogisticRegression() logisticRegr_sm = logisticRegr_sm.fit(X_train_scal_sm, Y_train) Lr_y_pred_train_sm = logisticRegr_sm.predict(X_train_scal_sm) Lr_y_pred_test_sm = logisticRegr_sm.predict(X_test_scal_sm) scores_lr_sm = cross_val_score(estimator = logisticRegr_sm, y = Y_train, X = X_train_scal_sm, 
cv=5) print('Cross Validation Score:', np.mean(scores_lr_sm))",Yes,4,7.0 "cat_cols = df.select_dtypes(exclude=['number']).columns.values for col in cat_cols: if df[col].value_counts().count() > 20000 : print('Column {} has {} categories'.format(col,df[col].value_counts().count()))",No,5,54.0 "b""featurename = filter_X_train.columns\nimportances = list(rf.feature_importances_)\n\nfeature_importances = [(feature, round(importance, 3)) for feature, importance in zip(featurename, importances)]\nfeature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)\n\nprint('Top 50 Importance Features\\n')\n[print('Variable: {} Importance Score: {}'.format(*pair)) for pair in feature_importances[:50]];""",No,5,86.0 "test_df = pd.read_csv('../input/loan-default-prediction/test_v2.csv.zip') test_df.head()",No,5,45.0 "# Binary Encoding encoder = ce.BinaryEncoder(cols = keep_cols) bi_enc_df = encoder.fit_transform(test_df[keep_cols]) bi_col_name = bi_enc_df.columns test_df = pd.concat([test_df,bi_enc_df],axis = 1) test_df.head()",Yes,4,20.0 "# Create select_test_df by drop some columns in test_df select_test_df = pd.concat([test_df['id'],test_df[filter_X_train.columns]],axis = 1) select_test_df.head()",No,4,11.0 "# Check missing value select_test_df.isnull().sum().value_counts()",No,5,39.0 "# Impute missing value in numeric columns with median numeric_cols = select_test_df.select_dtypes(include=['number']).columns.values for col in numeric_cols: if col in list(col_pct_miss_df.column_name): med = df[col].median() select_test_df[col] = select_test_df[col].fillna(med) not_numeric_cols = select_test_df.select_dtypes(exclude=['number']).columns.values for col in not_numeric_cols: mode = df[col].mode() select_test_df[col] = select_test_df[col].fillna(mode[0])",No,5,17.0 "# Find columns that contain missing value nan_columns = select_test_df.isna().any() columns_with_nan = select_test_df.columns[nan_columns].tolist() columns_with_nan",No,3,71.0 "# Replace missing value with zero select_test_df[columns_with_nan] = select_test_df[columns_with_nan].fillna(0)",No,5,17.0 select_test_df.head(),No,5,41.0 "# Random Forest test_df_rf = test_df.copy() test_df_rf['loss'] = rf.predict(select_test_df.loc[:,select_test_df.columns != 'id']) test_df_rf.head()",No,5,48.0 "# Export sample_submission of random forest sample_submission = test_df_rf[['id','loss']] sample_submission.to_csv('sample_submission_rf.csv', index = False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib import rcParams import seaborn as sb from collections import Counter import warnings warnings.filterwarnings(""ignore"") from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier from sklearn.naive_bayes import GaussianNB,MultinomialNB from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from xgboost import XGBClassifier from sklearn.linear_model import SGDClassifier from sklearn.naive_bayes import BernoulliNB from xgboost import XGBClassifier from sklearn.preprocessing import LabelEncoder,normalize,MinMaxScaler from sklearn.metrics import f1_score from sklearn.model_selection import train_test_split,cross_val_score from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve import seaborn as sns",No,5,23.0 "import tensorflow as tf # GPU device Check. 
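# (editor's aside - a minimal sketch assuming TensorFlow 2.1+: the physical device list can
#  also be inspected directly; the gpu_device_name() check below is kept unchanged)
gpus = tf.config.list_physical_devices('GPU')
print('GPUs visible to TensorFlow:', gpus)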
device_name = tf.test.gpu_device_name() if device_name == '/device:GPU:0': print('Found GPU at: {}'.format(device_name)) else: raise SystemError('GPU device not found')",Yes,4,22.0 "# Reading data train = pd.read_csv('../input/higgs-boson/training.zip') test = pd.read_csv('../input/higgs-boson/test.zip')",No,5,45.0 "b""print(train.columns.values,'\\n')\nprint(test.columns.values)""",No,5,71.0 "train = train.drop(['Weight'], axis=1)",No,5,10.0 "print(train['Label'].value_counts()) rcParams['figure.figsize'] = 10,5 sb.barplot(x = train['Label'].value_counts().index, y = train['Label'].value_counts().values) plt.title('Label counts') plt.show()",No,5,33.0 "# getting dummy variables column enc = LabelEncoder() train['Label'] = enc.fit_transform(train['Label']) train.head()",No,4,20.0 "y = train[""Label""] X = train X_test = test",No,5,21.0 "X.set_index(['EventId'],inplace = True) X_test.set_index(['EventId'],inplace = True) X = X.drop(['Label'], axis=1) X.head()",No,4,21.0 X_test.head(),No,5,41.0 "#Normalizing from sklearn.preprocessing import normalize X = normalize(X) X_test = normalize(X_test)",No,5,18.0 "b""# print(X.isnull().sum(),'\\n')\n# print(X_test.isnull().sum())""",No,5,53.0 "b""#print(X.isnull().sum(),'\\n')\n#print(X_test.isnull().sum())""",No,5,53.0 "import pandas as pd from sklearn import ensemble",No,5,22.0 "# The competition datafiles are in the directory ../input file_train = ""../input/train.csv"" file_test = ""../input/test.csv"" df_train = pd.read_csv(file_train) df_test = pd.read_csv(file_test) df_train.head()",Yes,3,45.0 "feature_cols = [col for col in df_train.columns if col not in ['Cover_Type','Id']] X_train = df_train[feature_cols] X_test = df_test[feature_cols] y = df_train['Cover_Type'] # target test_ids = df_test['Id'] # for submission",No,5,21.0 "clf = ensemble.RandomForestClassifier(n_estimators=200,n_jobs=-1,random_state=0) clf.fit(X_train, y)",Yes,3,4.0 "file_submission = ""rf200.submission.csv"" with open(file_submission, ""w"") as outfile: outfile.write(""Id,Cover_Type\ "") for e, val in enumerate(list(clf.predict(X_test))): outfile.write(""%s,%s\ ""%(test_ids[e],val))'",No,5,25.0 "import time import pandas as pd from sklearn.cross_validation import train_test_split from sklearn import ensemble from sklearn.metrics import accuracy_score import numpy as np loc_test = ""../input/test.csv"" loc_train = ""../input/train.csv"" loc_submission = ""forest-cover-type-prediction.AspiringGuru.csv"" df_test = pd.read_csv(loc_test) df_train = pd.read_csv(loc_train) print (""type(df_test)="", type(df_test), ""df_test.shape="", df_test.shape) print (""type(df_train)="", type(df_train), ""df_train.shape="", df_train.shape) #build list of all columns except the ones we don't want. 
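# (editor's aside, sketch only - not in the original script: an equivalent way to build the
#  feature frame from the df_train loaded above is pandas drop, which avoids listing every
#  kept column by hand)
# X_train_alt = df_train.drop(columns=['Cover_Type', 'Id'])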
# ('Cover_Type' is the predicted value), 'Id' is a unique row identifier feature_cols = [col for col in df_train.columns if col not in ['Cover_Type', 'Id']] #create dataframe of the columns desired from the input data for test and train X_train = df_train[feature_cols] X_test = df_test[feature_cols] #create dataframe of the predicted value to use for building classifier train_y = df_train['Cover_Type'] # test_ids = df_test['Id'] #test_y = df_test['Cover_Type'] del df_train del df_test print (""creating classifier"") start_time = time.time() clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0) print(""--- time to build ensemble.RandomForestClassifier %s seconds ---"" % (time.time() - start_time)) #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html #n_jobs=-1 : the number of jobs is set to the number of cores.(runs faster) #n_estimators = The number of trees in the forest. print (""fitting from train data"") start_time = time.time() clf.fit(X_train, train_y) print(""--- time to clf.fit %s seconds ---"" % (time.time() - start_time)) print (""predicting from train data"") start_time = time.time() train_y_predicted = clf.predict(X_train) print(""--- time to clf.predict %s seconds ---"" % (time.time() - start_time)) print (""type(train_y_predicted)="", type(train_y_predicted), ""len(train_y_predicted)"", len(train_y_predicted), ""train_y_predicted.shape"", train_y_predicted.shape) print (""type(train_y)="", type(train_y), ""train_y.shape="", train_y.shape) print (""train_y = "", list(train_y[0:20, ]) ) print (""train_y_predicted = "", list(train_y_predicted[0:20, ]) ) print (""predicting from test data"") start_time = time.time() predicted = clf.predict(X_test) print (""--- time to clf.predict %s seconds ---"" % (time.time() - start_time)) print (""type(predicted)="", type(predicted), ""len(predicted)"", len(predicted) ) print (""type(test_ids)="", type(test_ids), ""len(test_ids)="", len(test_ids) ) print (""calculating accuracy_score on train data."") start_time = time.time() score = accuracy_score(train_y, train_y_predicted) print (""--- time to calcualte accuracy_score %s seconds ---"" % (time.time() - start_time)) print (""type(score)="", type(score), ""score="", score ) #for i in range(len(predicted)): print (""clf.predicting & writing to file"") start_time = time.time() with open(loc_submission, ""w"") as outfile: outfile.write(""Id,Cover_Type\ "") for e, val in enumerate(list(clf.predict(X_test))): outfile.write(""%s,%s\ "" % (test_ids[e], val)) print(""--- time to clf.predict & write to file %s seconds ---"" % (time.time() - start_time)) '",Yes,1,22.0 "df=pd.read_csv(""../input/train.csv"") test=pd.read_csv(""../input/test.csv"") y=df[""Cover_Type""] x=df.iloc[:,:-1] id=df.iloc[:,:1]",Yes,2,22.0 "from sklearn.tree import DecisionTreeClassifier reg=DecisionTreeClassifier() reg.fit(x,y)",Yes,2,4.0 pred=reg.predict(test),No,5,48.0 "mysubmission=pd.DataFrame({'Id':test.Id,'Cover_Type':pred})",No,5,55.0 "mysubmission.to_csv(""submission.csv"",index=False)",No,5,25.0 "temp = pd.read_csv(""submission.csv"") temp",No,5,45.0 "from IPython.display import display import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 "train_set = pd.read_csv('../input/train.csv') test_set = pd.read_csv('../input/test.csv')",No,5,45.0 "display(train_set.head()) display(train_set.describe())",No,4,40.0 "display(train_set.keys()) display(len(train_set.keys()))",No,5,40.0 "# How about using this features directly? 
(Not using the scaling and normalization) fig = plt.figure() fig.set_size_inches(35, 35) sns.set(font_scale=2) # Delete 'Id' and change cover type to dummy variables cont_var_train_set = train_set.drop('Id', axis=1).drop(cate_vars, axis=1) # Categorical feature : cannot using correlation directly. cont_var_train_set_dum = pd.get_dummies(cont_var_train_set, columns=['Cover_Type']) correlation = cont_var_train_set_dum.corr() sns.heatmap(correlation, cmap='viridis', annot=True, linewidths=3)",Yes,1,80.0 from sklearn.preprocessing import StandardScaler,No,5,22.0 "# using scaler scaler = StandardScaler() scaler.fit(scaled_feat) scaled_feat = scaler.transform(scaled_feat) scaled_feat = pd.DataFrame(scaled_feat, columns=cont_vars) scaled_feat.head()",Yes,1,4.0 "fig = plt.figure() fig.set_size_inches(35, 35) correlation2 = pd.concat([scaled_feat, dummy_labels], axis=1).corr() sns.heatmap(correlation2, cmap='viridis', annot=True, linewidths=3)",Yes,2,11.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report",No,5,22.0 "# Spliting the datasets features = pd.concat([scaled_feat, train_set[cate_vars]], axis=1) features.head()",Yes,2,11.0 "rf_model = RandomForestClassifier(max_depth=7, n_estimators=300) rf_model.fit(x_train, y_train)",Yes,2,4.0 "# Predicting naively pred = rf_model.predict(x_test) display(accuracy_score(y_test, pred)) display(classification_report(y_test, pred))",Yes,3,48.0 "# See the importance of features importances = rf_model.feature_importances_ indices = np.argsort(importances) fig = plt.figure() fig.set_size_inches(20, 20) sns.set(font_scale=1.5) plt.title('Feature Importances') plt.barh(range(len(indices)), importances[indices], color='b', align='center') plt.yticks(range(len(indices)), features.keys()[indices]) plt.xlabel('Relative Importance')",Yes,3,79.0 "# dimensional reduction from sklearn.decomposition import PCA import numpy as np pca = PCA(n_components=None, random_state=20180425) pca.fit(features)",Yes,2,22.0 "pca_var = pca.explained_variance_ratio_ fig, ax = plt.subplots(1, 2, figsize=(16, 8)) ax1, ax2 = ax.flatten() ax1.plot(pca_var) ax2.plot(np.cumsum(pca_var))",No,5,33.0 train_set.head(),No,5,41.0 "wilderness_area_col = train_set['Wilderness_Area'].astype(int) soil_type_col = train_set['Soil_Type'].astype(int) display(wilderness_area_col.head()) display(soil_type_col.head())",Yes,3,16.0 import scipy.stats as ss,No,5,22.0 "cate_vars_1 = ['Wilderness_Area', 'Soil_Type']",No,5,77.0 "input_features = pd.concat([scaled_feat, wilderness_area_col, soil_type_col], axis=1) labels = train_set['Cover_Type'] display(input_features.head()) display(labels.head())",Yes,1,11.0 "x_train, x_test, y_train, y_test = train_test_split(input_features, labels, random_state=20190501, test_size=0.3)",No,5,13.0 test_set_rf = test_set.copy(),No,5,77.0 "test_set_rf_cont = test_set_rf[cont_vars] scaler.fit(test_set_rf_cont) test_set_rf_cont = scaler.transform(test_set_rf_cont) test_set_rf_cont = pd.DataFrame(test_set_rf_cont, columns=cont_vars) test_set_rf_cate = test_set_rf[cate_vars] scaled_test_set_rf = pd.concat([test_set_rf_cont, test_set_rf_cate], axis=1) scaled_test_set_rf.head()",Yes,2,12.0 "rf_pred = rf_model.predict(scaled_test_set_rf) rf_result = pd.concat([test_set['Id'], pd.DataFrame({'Cover_Type': rf_pred})], axis=1) rf_result.to_csv(""rf_submission.csv"", index=False)'",Yes,3,48.0 "# 1. 
scaling the continuous features test_cont_feat = test_set_copy[cont_vars] scaler.fit(test_cont_feat) test_scaled_cont_feat = scaler.transform(test_cont_feat) test_scaled_cont_feat = pd.DataFrame(test_scaled_cont_feat, columns=cont_vars) # 2. categorical features test_cate_feat = test_set_copy[cate_vars_1].astype(int) # 3. concat test_input_features = pd.concat([test_scaled_cont_feat, test_cate_feat], axis=1)",Yes,1,12.0 "display(test_cont_feat.head()) display(test_scaled_cont_feat.head()) display(test_input_features.head())",No,5,41.0 "result = pd.concat([test_set['Id'], pd.DataFrame({'Cover_Type': result})], axis=1)",Yes,4,11.0 result.head(),No,5,41.0 "result.to_csv(""submission.csv"", index=False)",No,5,25.0 "# -*- coding: utf-8 -*- """""" Data Science and Visualization with Python Example: Forest Cover Type URL (problem): https://www.kaggle.com/c/forest-cover-type-prediction URL (solution): https://www.kaggle.com/ivarvb/forest-cover-type Author: Ivar Vargas Belizario E-mail: ivar@usp.br """""" import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.metrics import accuracy_score from sklearn.manifold import TSNE from matplotlib import pyplot as plt """""" =================================================== I. Data science =================================================== =================================================== 1. Read the training and test data =================================================== """""" train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') """""" =================================================== 2. Preprocessing =================================================== =================================================== 2.1 Cleaning and sampling =================================================== """""" train = train.fillna(0) test = test.fillna(0) # define the class label (target) and identifier (id) columns column_target = 'Cover_Type' column_id = 'Id' # drop the attributes that contain null values data = train.dropna(axis='columns') # number of instances before sampling print (""Total data: "",len(data)) # separate the attributes: the instance identifier (id), # the data attributes (X) and the attribute that contains the class label (y) X = data y = data[column_target] # percentage used for sampling c_sample = 0.99 # sampling if c_sample < 1.0: X_null, X, y_null, y = train_test_split(X, y, test_size=c_sample, random_state=0) ID = X[column_id] y = X[column_target] X = X.drop([column_id, column_target], axis=1).select_dtypes(include=[np.number]) train_select_atributes = X.columns print (""Sample size: "",len(X)) """""" =================================================== 2. 
Processing =================================================== =================================================== 2.1 Dimensionality reduction (feature selection) =================================================== """""" """""" model = ExtraTreesClassifier() model.fit(X, y) imp = model.feature_importances_ names = [] for i in range(len(imp)): r = [] r.append(i) r.append(imp[i]) names.append(r) names = sorted(names, key=lambda x: x[1], reverse=True) fenames = [] columns = list(set(train_select_atributes)) for i in range(len(names)): fenames.append(columns[names[i][0]]) train_select_atributes = fenames[:30] X = X[train_select_atributes].values y = y.values """""" # convert to arrays X = X.values y = y.values """""" =================================================== 3. Learning model (supervised learning): =================================================== =================================================== 3.1. Training: =================================================== """""" # define the model for classification model = RandomForestClassifier(random_state=0, n_estimators=500) # train the model with k-fold (10-fold) cross-validation kf = StratifiedKFold(n_splits=10) outcomes = [] # for each fold for train_index, test_index in kf.split(X, y): Xtrain, Xtest = X[train_index], X[test_index] ytrain, ytest = y[train_index], y[test_index] model.fit(Xtrain, ytrain) expected = ytest predictions = model.predict(Xtest) accuracy = accuracy_score(ytest, predictions) outcomes.append(accuracy) # print the mean accuracy obtained during training mean_outcome = np.array(outcomes).mean() print (""Mean Accuracy:"", mean_outcome) """""" =================================================== 3.2. Testing: =================================================== """""" # select the same attributes as used for the training set X_test = test[train_select_atributes] x_test_id = test[column_id] predictions = model.predict(X_test) predictions = pd.DataFrame(predictions, columns = [""Cover_Type""]) # save the results obtained on the test dataset result = pd.concat([x_test_id, predictions], axis=1, sort=False) result.to_csv(""result.csv"", mode = 'w', index=False) """""" =================================================== II. Visualization of the dataset (projections) =================================================== """""" #print (y) isfineClass = False for i in range(len(y)): if y[i]==0: isfineClass=True break; if isfineClass==False: for i in range(len(y)): v = y[i] y[i] = v-1 # sampling for the visualization c_sample = 0.1 if c_sample < 1.0: X_null, X, y_null, y = train_test_split(X, y, test_size=c_sample, random_state=0) print (""Sample size for the visualization: "", len(X)) # visualization with t-SNE projections tsne = TSNE(n_components=2, random_state=0) X_2d = tsne.fit_transform(X) plt.figure(figsize=(6, 5)) colors = [""#1f77b4"", ""#ff7f0e"", ""#2ca02c"", ""#d62728"", ""#9467bd"", ""#8c564b"", ""#e377c2"", ""#7f7f7f"", ""#bcbd22"", ""#17becf""] for i in range(len(y)): v = y[i] plt.plot(X_2d[i, 0], X_2d[i, 1], 'o', color=colors[v], alpha=0.3) # show the projection plt.show() '",No,2,22.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns",Yes,4,22.0 "#data imports data_train = pd.read_csv(""../input/train.csv"") data_test = pd.read_csv(""../input/test.csv"") data_train.head()",Yes,4,45.0 "keras.backend.clear_session() model = keras.models.Sequential() model.add(keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(32, 32, 3))) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (2, 2), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(1, 1)) model.add(keras.layers.Dropout(0.1)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (3, 3), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(2, 2)) model.add(keras.layers.Dropout(0.2)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (2, 2), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(1, 1)) model.add(keras.layers.Dropout(0.1)) model.add(keras.layers.Flatten()) model.add(keras.layers.Dense(64, activation='relu')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(0.2)) model.add(keras.layers.Dense(10, activation=""softmax"")) model.compile(loss=keras.losses.CategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.RMSprop(lr=0.001, decay = 1e-3, momentum = 0.3), metrics=['accuracy']) model.input '",No,5,4.0 " history = model.fit(train_generator, steps_per_epoch=(train_size//batch_size), epochs= 5, validation_data=validation_generator, validation_steps=(validation_size//batch_size) )",No,5,7.0 test_labels.head(2),No,5,41.0 "if os.path.exists(""./test""): shutil.rmtree(""./test"") if os.path.exists(""./train""): shutil.rmtree(""./train"") if not os.path.exists(""./data""): os.mkdir(""./data"") shutil.unpack_archive('/kaggle/input/cifar-10/test.7z', '/kaggle/working/data')'",Yes,2,88.0 "test_dir = os.listdir(""./data/test""); test_dir_len = len(test_dir) print('min:\\t',min(test_dir)) print('max:\\t',max(test_dir)) print("".\\\\test:\\t"",test_dir_len) print(""files:\\t\\t"",test_dir[:3])'",No,3,88.0 "test_data_generator = ImageDataGenerator(rescale=1./255.) 
test_generator = test_data_generator.flow_from_directory(directory='/kaggle/working/data', batch_size=batch_size, shuffle=False,color_mode='rgb', target_size=(32,32), class_mode=None)",No,5,84.0 predict_test = model.predict_generator(test_generator),No,5,48.0 "predict_generator = np.argmax(predict_test, axis=1) print(class_names) predict_generator[:2],[class_names[int(i)] for i in predict_generator[:2]]",No,4,14.0 "submission = pd.DataFrame(columns = ['id','label'],dtype=str) submission[""label""] = [class_names[int(i)] for i in predict_generator] submission[""id""] = [ (''.join(filter(str.isdigit, name ))) for name in test_generator.filenames] submission.head(101)'",Yes,4,12.0 submission.values[50:100],No,5,41.0 "index = 0 fig = plt.figure(figsize = (16,10)) for item in submission.values[50:70]: index += 1 plt.subplot(5, 5, index) test_path = '/kaggle/working/data/test/'+item[0]+'.png' test_image = load_img(test_path, target_size=(32,32)) plt.imshow(test_image) plt.colorbar() plt.grid(False) plt.axis(""off"") plt.title(item[1]) plt.show()'",No,5,56.0 " shutil.rmtree(""./data"")",No,5,84.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import matplotlib.pyplot as plt import seaborn as sns import os from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression,SGDClassifier from sklearn.tree import ExtraTreeClassifier from sklearn.svm import SVC from statistics import variance from sklearn.feature_selection import VarianceThreshold print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",Yes,5,88.0 "test = pd.read_csv(""../input/test.csv"") train = pd.read_csv(""../input/train.csv"")",No,5,45.0 "train.sample() ",No,5,41.0 "#check for missing values train.info()",No,5,40.0 "sns.heatmap(train.isnull(),cbar = False)",No,5,80.0 "distance=pd.DataFrame(train,columns = ['Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Roadways', 'Hillshade_Noon','Horizontal_Distance_To_Fire_Points']) for column in distance: plt.figure() distance.boxplot([column])",Yes,4,12.0 "#Cover type is the target to be predicted. 
#Train test split x_train,x_test,y_train,y_test= train_test_split(train.drop('Cover_Type',axis = 1),train['Cover_Type'],test_size = 0.3,random_state = 17)",No,5,13.0 "#Building logistic regression model logreg = LogisticRegression() logreg.fit(x_train,y_train)",Yes,5,7.0 "#Predicting logistic regression results logreg.predict(x_test)",No,5,48.0 "#Logistic regression test scores score = logreg.score(x_test, y_test) print(score)",No,5,49.0 "#Random Forest from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier",No,5,22.0 "tree_model = DecisionTreeClassifier() ensemble_model = RandomForestClassifier() ",No,5,4.0 "tree_model.fit(x_train,y_train)",No,5,7.0 "ensemble_model.fit(x_train,y_train)",No,5,7.0 "tree_predict=tree_model.predict(x_test) tree_model.score(x_test,y_test)",Yes,4,48.0 "ensemble_predict= ensemble_model.predict(test) print (ensemble_predict) ensemble_model.score(x_test,y_test)",Yes,4,48.0 submission.shape,No,5,58.0 "x_test.shape test.shape ",No,5,58.0 "#current public score is 0.66,this should be improved #checking the variance of each feature train1 = train test1 = test sel = VarianceThreshold(threshold=(.8 * (1 - .8))) sel.fit_transform(train1) train1.head(40) #this is based on this article ,https://scikit-learn.org/stable/modules/feature_selection.html #could see no rows being removed in the data set,as all of them have valid values,non null. ",Yes,3,86.0 tree_model.fit,No,5,7.0 "tree_model.predict(test1) tree_model.score(x_test,y_test)",Yes,4,48.0 "pd.DataFrame([train.mean(), train.std(), train.var()], index=['Mean', 'Std. dev', 'Variance']) ",No,3,12.0 "x=pd.DataFrame(ensemble_model.feature_importances_, index=x_train.columns, columns=['Importance']).sort_values( by='Importance', ascending=False)[:10] print(x)",No,4,79.0 "#Modelling based on important features alone train2 = train test2 = test train_imp = train2[['Id','Elevation','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Hillshade_9am', 'Aspect','Hillshade_3pm', 'Wilderness_Area4','Cover_Type']] ",No,5,12.0 "x_train_imp,x_test_imp,y_train_imp,y_test_imp= train_test_split(train_imp.drop('Cover_Type',axis = 1),train_imp['Cover_Type'], test_size = 0.3,random_state = 17) ",No,5,13.0 "logreg1 = LogisticRegression() logreg1.fit(x_train_imp,y_train_imp) logreg1.predict(x_test_imp) logreg1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "tree_model1 =DecisionTreeClassifier() tree_model1.fit(x_train_imp,y_train_imp) tree_predict=tree_model1.predict(x_test_imp) tree_model1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "ensemble_1 = RandomForestClassifier() ensemble_1.fit(x_train_imp,y_train_imp) ensemble_predict= ensemble_1.predict(x_test_imp) print (ensemble_predict) ensemble_1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "pd.DataFrame(tree_model.feature_importances_,index = x_train.columns,columns=['Importance']).sort_values( by = 'Importance',ascending = False)[:10]",No,4,79.0 "#Modelling based on important features alone train = train.drop([""Soil_Type7"",""Soil_Type15"",""Wilderness_Area1"",""Wilderness_Area2"",""Wilderness_Area3"",""Slope"", ""Hillshade_Noon""],axis = 1) test = test.drop([""Soil_Type7"",""Soil_Type15"",""Wilderness_Area1"",""Wilderness_Area2"",""Wilderness_Area3"",""Slope"", ""Hillshade_Noon""],axis = 1) train3 = train test3 = test train[:10] #train_imp = train3[train] #[['Id','Elevation','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Fire_Points', # 
'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Hillshade_9am', # 'Aspect','Hillshade_3pm', # 'Wilderness_Area4','Cover_Type']]'",No,4,10.0 "x_train_3,x_test_3,y_train_3,y_test_3= train_test_split(train3.drop('Cover_Type',axis = 1),train3['Cover_Type'], test_size = 0.3,random_state = 17) ",No,4,13.0 "logreg2 = LogisticRegression() logreg2.fit(x_train_3,y_train_3) logreg2.predict(x_test_3) logreg2.score(x_test_3,y_test_3)",Yes,4,7.0 "tree_model2 =DecisionTreeClassifier() tree_model2.fit(x_train_3,y_train_3) tree_predict2=tree_model2.predict(x_test_3) tree_model2.score(x_test_3,y_test_3) tree_test_pred = tree_model2.predict(test)",Yes,3,7.0 "ensemble_2 = RandomForestClassifier() ensemble_2.fit(x_train_3,y_train_3) ensemble_predict2= ensemble_2.predict(x_test_3) print (ensemble_predict2) ensemble_2.score(x_test_3,y_test_3) ensemble_test_pred = ensemble_2.predict(test) ",Yes,4,7.0 "from sklearn.naive_bayes import GaussianNB nb = GaussianNB() nb.fit(x_train_3,y_train_3) nb.predict(x_test_3) nb.score(x_test_3,y_test_3)",Yes,3,7.0 "sgd = SGDClassifier(loss = 'modified_huber',shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,3,7.0 "sgd = SGDClassifier(loss = 'log',shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,4,7.0 "sgd = SGDClassifier(shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,3,7.0 "submission = pd.DataFrame({'Id':test.Id,'Cover_Type':ensemble_test_pred}) submission.head() submission.to_csv('submission.csv',index = False)",Yes,5,25.0 "submission_tree = pd.DataFrame({'Id':test.Id,'Cover_Type':tree_test_pred}) submission_tree.head() submission_tree.to_csv('submission2.csv',index = False)",Yes,5,25.0 "#Extra tree classifier is a tree based model for classification problems et = ExtraTreeClassifier() et.fit(x_train_3,y_train_3) et.predict(x_train_3) et.score(x_test_3,y_test_3)",Yes,3,7.0 "from sklearn.semi_supervised import LabelPropagation lb = LabelPropagation() lb.fit(x_train_3,y_train_3) lb.predict(x_train_3) lb.score(x_test_3,y_test_3)",Yes,3,7.0 "from sklearn.neighbors import KNeighborsClassifier knng =KNeighborsClassifier() knng.fit(x_train_3,y_train_3) knng.predict(x_train_3) knng.score(x_test_3,y_test_3)",Yes,3,7.0 "features_soil = ['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'] data_train[""Soil_Count""] = data_train[features_soil].apply(sum, axis=1) data_train.head()'",Yes,4,8.0 data_train.Soil_Count.describe(),No,5,40.0 data_test[features_soil].describe(),No,5,40.0 "data_train[""Soil_Type""] = data_train[features_soil].apply(np.argmax, axis=1) data_train.head()",Yes,4,8.0 "data_train[""Soil_Type""] = data_train[""Soil_Type""].apply(lambda x: x.split(""Soil_Type"")[-1]) data_train.head()",Yes,4,8.0 "features_wilderness = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3','Wilderness_Area4'] 
data_train[""Wilderness_Area""] = data_train[features_wilderness].apply(sum, axis=1) data_train.Wilderness_Area.describe()'",Yes,4,8.0 "data_train[""Wilderness_Area""] = data_train[features_wilderness].apply(np.argmax, axis=1) data_train[""Wilderness_Area""] = data_train[""Wilderness_Area""].apply(lambda x: x.split(""Wilderness_Area"")[-1]) data_train.Wilderness_Area.head()",Yes,4,8.0 "sns.countplot(data_train.Cover_Type) plt.show()",No,5,33.0 data_train.columns,No,5,71.0 "features = ['Elevation', 'Aspect', 'Slope','Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', ""Cover_Type""] sns.heatmap(data=data_train[features].corr(), annot=True, linecolor=""w"", fmt="".1"") plt.show()'",No,5,80.0 "#Import pandas, tensorflow e keras import pandas as pd import numpy import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn import preprocessing import tensorflow as tf from tensorflow.python.data import Dataset import keras from keras import regularizers from keras.utils import to_categorical from keras import models from keras import layers from keras import backend as K import os #for dirname, _, filenames in os.walk('/kaggle/input'): # for filename in filenames: # print(os.path.join(dirname, filename)) #Lettura dati df = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/train.csv"") dfT = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/test.csv"")'",Yes,5,45.0 "data_train = pd.read_csv(""../input/train.csv"") final_train = clear_dataset(data_train)",Yes,4,45.0 "#Selezioniamo le caratteristiche x = df[df.columns[1:55]] xT = dfT[dfT.columns[1:55]] #Selezioniamo le etichette (8) y = df.Cover_Type #Split data into train and test x_train, x_test, y_train, y_test = train_test_split(x, y , train_size = 0.7, random_state = 90)",Yes,4,13.0 "x_data = final_train.drop([""Cover_Type"", ""Id""], axis=1) y_data = final_train[""Cover_Type""]",Yes,4,10.0 "# Normalize Training Data scaler = preprocessing.StandardScaler() scaler.fit(x_train.values[:,0:10]) x_train_norm = scaler.transform(x_train.values[:,0:10]) x_test_norm = scaler.transform(x_test.values[:,0:10]) x_sub = scaler.transform(xT.values[:,0:10]) x_train_norm=numpy.concatenate((x_train_norm,x_train.values[:,10:]),axis=1) x_test_norm=numpy.concatenate((x_test_norm,x_test.values[:,10:]),axis=1) x_sub=numpy.concatenate((x_sub,xT.values[:,10:]),axis=1)",Yes,4,18.0 "modelF = models.Sequential() modelF.add(layers.Dense(32,name=""Layer_1"",activation='relu',input_dim=54,kernel_initializer='he_normal',kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.08))) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(16,name=""Layer_2"",activation='relu')) modelF.add(layers.Dense(64,name=""Layer_22"",activation='relu')) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(64,name=""Layer_23"",activation='relu')) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(16,name=""Layer_4"",activation='relu')) modelF.add(layers.Dense(8,name=""Layer_5"",activation='softmax')) modelF.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) modelF.summary()'",No,5,4.0 "Net4 = modelF.fit( x_train_norm, y_train, epochs= 400, batch_size = 256, validation_data = (x_test_norm, y_test))",No,5,7.0 "_, train_acc = modelF.evaluate(x_train_norm, y_train, verbose=0) _, test_acc = modelF.evaluate(x_test_norm, y_test, verbose=0) 
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc)) # plot loss during training plt.rcParams['figure.figsize'] = (12.0, 9.0) plt.subplot(211) plt.title('Loss') plt.plot(Net4.history['loss'], label='train') plt.plot(Net4.history['val_loss'], label='test') plt.legend() # plot accuracy during training plt.subplot(212) plt.title('Accuracy') plt.plot(Net4.history['acc'], label='train') plt.plot(Net4.history['val_acc'], label='test') plt.legend() plt.show()",Yes,5,35.0 "test_predictions=modelF.predict_classes(x_sub, batch_size=256, verbose=0)",No,5,48.0 "solutions = pd.DataFrame({'Id':dfT.Id, 'Cover_Type':test_predictions}) solutions.to_csv('submission.csv',index=False)",Yes,5,25.0 "#This Python 3 environment comes with many helpful analytics libraries installed #It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python #For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.filterwarnings('ignore') %matplotlib inline #Input data files are available in the ""../input/"" directory. #For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) #Any results you write to the current directory are saved as output.",Yes,4,88.0 "df_train = pd.read_csv('../input/forest-cover-type-prediction/train.csv') df_test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')",No,5,45.0 df_train.dtypes,No,5,70.0 "pd.set_option('display.max_columns', None) df_train.describe()",No,4,40.0 "df_train = df_train.drop(['Soil_Type7', 'Soil_Type15'], axis = 1) df_test = df_test.drop(['Soil_Type7', 'Soil_Type15'], axis = 1) ",No,5,10.0 "df_train = df_train.iloc[:,1:] df_test = df_test.iloc[:,1:]",No,4,13.0 "size = 10 corrmat = df_train.iloc[:, :size].corr() f, ax = plt.subplots(figsize = (10,8)) sns.heatmap(corrmat, vmax = 0.8, square = True)",No,5,80.0 "data = df_train.iloc[:, :size] cols = data.columns #Running pearson coefficient for all combinations data_corr = data.corr() threshold = 0.5 corr_list = []",Yes,4,40.0 data_corr,No,4,40.0 "#sorting the highly correlated values for i in range(0, size): for j in range(i+1, size): if data_corr.iloc[i, j] >= threshold and data_corr.iloc[i, j]<1\\ or data_corr.iloc[i,j] < 0 and data_corr.iloc[i,j]<=-threshold: corr_list.append([data_corr.iloc[i,j],i,j])'",No,3,9.0 "#Sorting values s_corr_list = sorted(corr_list, key = lambda x: -abs(x[0])) #print the higher values for v, i, j in s_corr_list: print(""%s and %s = %.2f"" % (cols[i], cols[j], v))",Yes,5,9.0 "df_train.iloc[:, :10].skew()",No,4,40.0 "from sklearn.model_selection import train_test_split x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.25, random_state=42)",Yes,4,13.0 "for v, i, j in s_corr_list: sns.pairplot(data = df_train, hue = 'Cover_Type', size = 6, x_vars = cols[i], y_vars = cols[j]) plt.show()",No,5,33.0 "# A violin plot is a hybrid of a box plot and a kernel density plot, which shows peaks in the data. 
cols = df_train.columns size = len(cols) - 1 # We don't need the target attribute # x-axis has target attributes to distinguish between classes x = cols[size] y = cols[0:size] for i in range(0, size): sns.violinplot(data=df_train, x=x, y=y[i]) plt.show()",No,5,33.0 "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report def get_metrics(y_test, y_predicted): # true positives / (true positives+false positives) precision = precision_score(y_test, y_predicted, pos_label=None, average='weighted') # true positives / (true positives + false negatives) recall = recall_score(y_test, y_predicted, pos_label=None, average='weighted') # harmonic mean of precision and recall f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted') # true positives + true negatives/ total accuracy = accuracy_score(y_test, y_predicted) return accuracy, precision, recall, f1",Yes,4,49.0 df_train.Wilderness_Area2.value_counts(),No,5,72.0 "from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=100, max_depth=19, max_features=11,n_jobs=-1, random_state=42) clf.fit(x_train, y_train) y_predicted = clf.predict(x_val)",Yes,2,4.0 "### Group one-hot encoded variables of a category into one single variable cols = df_train.columns r,c = df_train.shape # Create a new dataframe with r rows, one column for each encoded category, and target in the end new_data = pd.DataFrame(index= np.arange(0,r), columns=['Wilderness_Area', 'Soil_Type', 'Cover_Type']) # Make an entry in data for each r for category_id, target_value for i in range(0,r): p = 0; q = 0; # Category1_range for j in range(10,14): if (df_train.iloc[i,j] == 1): p = j-9 # category_class break # Category2_range for k in range(14,54): if (df_train.iloc[i,k] == 1): q = k-13 # category_class break # Make an entry in data for each r new_data.iloc[i] = [p,q,df_train.iloc[i, c-1]] # plot for category1 sns.countplot(x = 'Wilderness_Area', hue = 'Cover_Type', data = new_data) plt.show() # Plot for category2 plt.rc(""figure"", figsize = (25,10)) sns.countplot(x='Soil_Type', hue = 'Cover_Type', data= new_data) plt.show()'",Yes,3,33.0 "from xgboost import XGBClassifier clf = XGBClassifier(n_estimators=200, learning_rate=0.3, max_depth=3,n_jobs=-1, seed=42, objective=""multi:softmax"") clf.fit(x_train, y_train) y_predicted = clf.predict(x_val) accuracy, precision, recall, f1 = get_metrics(y_val, y_predicted) print(""accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,4.0 "# Checking the value count for different soil_types for i in range(10, df_train.shape[1]-1): j = df_train.columns[i] print (df_train[j].value_counts())",No,4,72.0 "from lightgbm import LGBMClassifier clf = LGBMClassifier(n_estimators=200, learning_rate=0.3, max_depth=3,n_jobs=-1, seed=42, objective=""multi:softmax"") clf.fit(x_train, y_train) y_predicted = clf.predict(x_val) accuracy, precision, recall, f1 = get_metrics(y_val, y_predicted) print(""accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,4.0 "df_train = df_train.drop(['Soil_Type8', 'Soil_Type25'], axis=1) df_test = df_test.drop(['Soil_Type8', 'Soil_Type25'], axis=1) df_train1 = df_train # To be used for algos like SVM where we need normalization and StandardScaler df_test1 = df_test # To be used under normalization and StandardScaler",Yes,5,10.0 "# Checking for data transformation (take only non-categorical values) df_train.iloc[:,:10].skew()",No,5,40.0 
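The random forest, XGBoost and LightGBM cells above are compared on a single train/validation split. Below is a minimal sketch (an editorial addition, not one of the original cells) of cross-validating the same random-forest configuration so the weighted F1 is averaged over folds; it assumes the x_train/y_train split from the cells above, and the 5-fold setup and scorer are illustrative choices.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import RandomForestClassifier

# Same hyperparameters as the RandomForestClassifier cell above, scored with weighted F1 over 5 folds.
f1_weighted = make_scorer(f1_score, average='weighted')
rf_cv = RandomForestClassifier(n_estimators=100, max_depth=19, max_features=11, n_jobs=-1, random_state=42)
scores = cross_val_score(rf_cv, x_train, y_train, scoring=f1_weighted, cv=5, n_jobs=-1)
print('weighted F1 per fold:', scores)
print('mean +/- std: %.3f +/- %.3f' % (scores.mean(), scores.std()))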
"from scipy import stats plt.figure(figsize =(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Hydrology'] = np.sqrt(df_train1['Horizontal_Distance_To_Hydrology']),No,5,8.0 "plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 "#Vertical_Distance_To_Hydrology plt.figure(figsize=(8,6)) sns.distplot(df_train1['Vertical_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Vertical_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 "#Horizontal_Distance_To_Roadways plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Roadways'], fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Roadways'], plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Roadways'] = np.sqrt(df_train1['Horizontal_Distance_To_Roadways']),No,5,8.0 "# Plot again after sqrt transformation plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Roadways'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Roadways'], plot=plt)",Yes,5,33.0 "plt.figure(figsize=(8, 6)) sns.distplot(df_train1['Hillshade_9am'], fit=stats.norm) plt.figure(figsize=(8, 6)) res = stats.probplot(df_train1['Hillshade_9am'], plot = plt)",Yes,5,33.0 df_train['Hillshade_9am'] = np.square(df_train1['Hillshade_9am']),No,5,8.0 "plt.figure(figsize = (8,6)) sns.distplot(df_train['Hillshade_9am'], fit = stats.norm) fig = plt.figure(figsize = (8,6)) res = stats.probplot(df_train1['Hillshade_9am'], plot = plt)",Yes,5,33.0 "# Hillshade_Noon fig = plt.figure(figsize=(8,6)) sns.distplot(df_train1['Hillshade_Noon'],fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Hillshade_Noon'],plot=plt)",Yes,5,33.0 df_train1['Hillshade_Noon'] = np.square(df_train1['Hillshade_Noon']),No,5,8.0 "# Plot again after square transformation fig = plt.figure(figsize=(8,6)) sns.distplot(df_train1['Hillshade_Noon'],fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Hillshade_Noon'],plot=plt) ",Yes,5,33.0 "# Horizontal_Distance_To_Fire_Points plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Fire_Points'], fit=stats.norm) plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Fire_Points'],plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Fire_Points'] = np.sqrt(df_train1['Horizontal_Distance_To_Fire_Points']),No,5,8.0 "# Plot again after sqrt transformation plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Fire_Points'], fit=stats.norm) plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Fire_Points'],plot=plt)",Yes,5,33.0 "b""# To be used in case of algorithms like SVM\ndf_test1[['Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Fire_Points'\\\n ,'Horizontal_Distance_To_Roadways']] = np.sqrt(df_test1[['Horizontal_Distance_To_Hydrology',\\\n 'Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways']])""",No,5,8.0 "# To be used in case of algorithms like SVM df_test1[['Hillshade_9am','Hillshade_Noon']] = 
np.square(df_test1[['Hillshade_9am','Hillshade_Noon']])",No,5,8.0 "#non categorical variables only Size = 10 X_train_temp = df_train.iloc[:,:Size] X_test_temp = df_test.iloc[:,:Size] X_train_temp1 = df_train1.iloc[:,:Size] X_test_temp1 = df_test1.iloc[:,:Size] X_train_temp1 = StandardScaler().fit_transform(X_train_temp1) X_test_temp1 = StandardScaler().fit_transform(X_test_temp1)",Yes,4,14.0 "df_train1.iloc[:,:]",No,5,41.0 "r,c = df_train.shape X_train = np.concatenate((X_train_temp,df_train.iloc[:,Size:c-1]),axis=1) X_train1 = np.concatenate((X_train_temp1, df_train1.iloc[:,Size:c-1]), axis=1) # to be used for SVM y_train = df_train.Cover_Type.values",Yes,5,21.0 "from sklearn import svm from sklearn.model_selection import train_test_split #In the new version these are in the model_selection module. Use this: from sklearn.model_selection import learning_curve, GridSearchCV. from sklearn.model_selection import GridSearchCV, RandomizedSearchCV",No,5,22.0 "x_data, x_test_data, y_data, y_test_data = train_test_split(X_train1,y_train,test_size=0.2, random_state=123) svm_para = [{'kernel':['rbf'],'C': [1,10,100,100]}]",Yes,4,13.0 "classifier = GridSearchCV(svm.SVC(),svm_para,cv=3,verbose=2) classifier.fit(x_data,y_data) classifier.best_params_ #classifier.best_score_",Yes,4,7.0 "# Parameters optimized using the code in above cell #C_opt = 10 # reasonable option #clf = svm.SVC(C=C_opt,kernel='rbf') #clf.fit(X_train1,y_train) classifier.fit(X_train1,y_train) classifier.score(X_train1,y_train)",Yes,4,7.0 classifier.best_score_,No,5,1.0 classifier.cv_results_,No,5,2.0 df_Test1 = pd.read_csv('../input/forest-cover-type-prediction/test.csv'),No,5,45.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.metrics import classification_report x_data, x_test_data, y_data, y_test_data = train_test_split(X_train,y_train,test_size= 0.3, random_state=0) etc_para = [{'n_estimators': [20, 30, 100], 'max_depth':[5, 10, 15], 'max_features': [0.1, 0.2, 0.3]}] #default number of features is sqrt(n) #default number of min_samples_leaf is 1",Yes,3,13.0 "ETC = GridSearchCV(ExtraTreesClassifier(),param_grid=etc_para, cv=10, n_jobs=-1) ETC.fit(x_data, y_data) ETC.best_params_ ETC.best_score_",No,5,6.0 "b""print ('Best accuracy obtained: {}'.format(ETC.best_score_))\nprint ('Parameters:')\nfor key, value in ETC.best_params_.items():\n print('\\t{}:{}'.format(key,value))""",Yes,3,49.0 "# Classification Report Y_pred = ETC.predict(x_test_data) target = ['class1', 'class2','class3','class4','class5','class6','class7' ] print (classification_report(y_test_data, Y_pred, target_names=target))",Yes,4,48.0 "from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit def plot_learning_curve(model,title, X, y,n_jobs = 1, ylim = None, cv = None,train_sizes = np.linspace(0.1, 1, 5)): # Figrue parameters plt.figure(figsize=(10,8)) plt.title(title) if ylim is not None: plt.ylim(*ylim) plt.xlabel('Training Examples') plt.ylabel('Score') train_sizes, train_score, test_score = learning_curve(model, X, y, cv = cv, n_jobs=n_jobs, train_sizes=train_sizes) # Calculate mean and std train_score_mean = np.mean(train_score, axis=1) train_score_std = np.std(train_score, axis=1) test_score_mean = np.mean(test_score, axis=1) test_score_std = np.std(test_score, axis=1) plt.grid() plt.fill_between(train_sizes, train_score_mean - train_score_std, train_score_mean + train_score_std,\\ alpha = 0.1, color = 'r') plt.fill_between(train_sizes, test_score_mean - test_score_std, test_score_mean + 
test_score_std,\\ alpha = 0.1, color = 'g') plt.plot(train_sizes, train_score_mean, 'o-', color=""r"", label=""Training score"") plt.plot(train_sizes, test_score_mean, 'o-', color=""g"", label=""Cross-validation score"") plt.legend(loc = ""best"") return plt'",Yes,5,35.0 "b""# 'max_features': 0.3, 'n_estimators': 100, 'max_depth': 15, 'min_samples_leaf: 1'\netc = ExtraTreesClassifier(bootstrap=True, oob_score=True, n_estimators=100, max_depth=10, max_features=0.3, \\\n min_samples_leaf=1)\n\netc.fit(X_train, y_train)\n# yy_pred = etc.predict(X_test)\netc.score(X_train, y_train)""",Yes,3,4.0 "r,c = df_test.shape X_test = np.concatenate((X_test_temp, df_test.iloc[:,Size:c]), axis = 1) yy_pred = etc.predict(X_test) solution = pd.DataFrame({'Id':df_Test1.Id, 'Cover_Type':yy_pred}, columns = ['Id','Cover_Type']) solution.to_csv('ETCcover_sol.csv', index=False)",Yes,4,48.0 "# Plotting learning curve title = 'Learning Curve (ExtraTreeClassifier)' # cross validation with 50 iterations to have a smoother curve cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0) model = etc plot_learning_curve(model,title,X_train, y_train, n_jobs=-1,ylim=None,cv=cv) plt.show()",No,5,35.0 "data_test = pd.read_csv(""../input/test.csv"") final_test = clear_dataset(data_test)",Yes,4,45.0 "y_predicted = clf.predict(x_train) accuracy, precision, recall, f1 = get_metrics(y_train, y_predicted) print(""train accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,27.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.metrics import confusion_matrix, classification_report from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier import keras from keras.models import Sequential from keras.layers import * import keras.backend as K",No,5,22.0 "test = pd.read_csv(""../input/forest-cover-type-prediction/test.csv"") train = pd.read_csv(""../input/forest-cover-type-prediction/train.csv"")",No,5,45.0 "X_train_full = train.drop(['Id', 'Cover_Type'], axis=1) y_train_full = train.Cover_Type - 1 X_test = test.drop('Id', axis=1) test_id = test.Id print(X_train_full.shape) print(X_test.shape)",Yes,3,10.0 "print(list(zip(range(0,56), X_train_full.columns)))",No,5,71.0 "scaler = MinMaxScaler() Xs_train_full = scaler.fit_transform(X_train_full) Xs_test = scaler.transform(X_test)",Yes,3,4.0 "Xs_train, Xs_valid, y_train, y_valid = train_test_split(Xs_train_full, y_train_full, test_size=0.2, random_state=1, stratify=y_train_full) print(Xs_train.shape) print(Xs_valid.shape)",Yes,2,13.0 "temp = LogisticRegression(max_iter=10000) temp.fit(Xs_train, y_train) temp.score(Xs_train, y_train)",Yes,2,4.0 "np.random.seed(1) model = Sequential() model.add(Dense(512, input_shape=(54,), activation='relu')) model.add(Dense(512, activation='relu')) model.add(Dense(512, activation='relu')) model.add(Dense(7, activation='softmax')) model.summary()",No,5,84.0 "opt = keras.optimizers.Adam(lr=0.001) model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy']) h1 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,2,4.0 
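The Keras model above is trained in stages, with the learning rate lowered by hand through K.set_value in the cells that follow. A minimal sketch (editorial addition, not the notebook's own code) of expressing that staged decay as a single fit call with a callback; the patience, factor and epoch count here are illustrative assumptions.
from keras.callbacks import ReduceLROnPlateau

# Drop the learning rate by 10x when validation loss stops improving,
# mimicking the manual 0.001 -> 0.0001 -> 0.00001 schedule used below.
lr_drop = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=100, min_lr=1e-5, verbose=1)
history = model.fit(Xs_train, y_train, batch_size=20000, epochs=1500,
                    validation_data=(Xs_valid, y_valid),
                    callbacks=[lr_drop], verbose=2)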
"K.set_value(model.optimizer.lr, 0.0001) h2 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,4,7.0 "K.set_value(model.optimizer.lr, 0.00001) h3 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,4,7.0 test_pred = model.predict_classes(Xs_test),No,5,48.0 !pip install seaborn,Yes,5,87.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from xgboost import XGBClassifier",No,5,22.0 "submission_sample = pd.read_csv('../input/forest-cover-type-prediction/sampleSubmission.csv') train = pd.read_csv('../input/forest-cover-type-prediction/train.csv') test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')",No,5,45.0 train.sample(5),No,5,41.0 test.sample(5),No,5,41.0 print(list(enumerate(train.columns))),No,5,71.0 train.nunique(),No,5,54.0 "submission = pd.DataFrame({ 'Id':test_id, 'Cover_Type':test_pred }) submission.head()",Yes,4,12.0 "submission.to_csv('my_submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "train_data=pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_data.head()",Yes,3,45.0 "test_data=pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,3,45.0 train_data.info(),No,5,40.0 test_data.info(),No,5,40.0 from sklearn.model_selection import train_test_split,No,5,22.0 "X=train_data.drop(labels=['Id','Cover_Type'],axis=1) y=train_data['Cover_Type']",Yes,3,10.0 "print(X_train.shape,y_train.shape) print(X_val.shape,y_val.shape)",Yes,5,58.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,y_train)",Yes,3,4.0 "rfc.score(X_val,y_val)",No,5,49.0 "predict=rfc.predict(test_data.drop(labels=['Id'],axis=1))",Yes,3,48.0 "Submission=pd.DataFrame(data=predict,columns=['Cover_Type']) Submission.head()",Yes,2,12.0 Submission.head(),No,5,41.0 Submission.to_csv('Submission.csv'),No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pickle pd.set_option('display.max_columns', 100) pd.options.mode.chained_assignment = None",Yes,4,22.0 "dtrain = pd.read_csv(train_path, index_col=0) dtest = pd.read_csv(test_path, 
index_col=0)",No,5,45.0 dtrain['Cover_Type'].value_counts(),No,5,72.0 dtrain.info(),No,5,40.0 "# Now this includes values for all classes, better to groupyby the target variable and then get description. dtrain.describe()",No,5,40.0 "X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 10,test_size=0.2,shuffle =True)",No,5,13.0 "logistic_regression= LogisticRegression() logistic_regression.fit(X_train,y_train) y_pred=logistic_regression.predict(X_test)",Yes,4,7.0 "# fit the model on the whole dataset random_forest = RandomForestClassifier() random_forest.fit(X_train, y_train)",No,5,7.0 "decisionTreeModel = DecisionTreeClassifier(criterion= 'entropy', max_depth = None, splitter='best', random_state=10) decisionTreeModel.fit(X_train,y_train)",No,5,7.0 "KNeighborsModel = KNeighborsClassifier(n_neighbors = 7, weights = 'distance', algorithm = 'brute') KNeighborsModel.fit(X_train,y_train)",No,5,7.0 "bernoulliNBModel = BernoulliNB(alpha=0.1) bernoulliNBModel.fit(X_train,y_train)",No,5,7.0 "gaussianNBModel = GaussianNB() gaussianNBModel.fit(X_train,y_train)",No,5,7.0 "XGB_Classifier = XGBClassifier() XGB_Classifier.fit(X_train, y_train)",No,5,7.0 "#evaluation Details models = [logistic_regression, random_forest, decisionTreeModel, KNeighborsModel, bernoulliNBModel, gaussianNBModel, XGB_Classifier] for model in models: print(type(model).__name__,' Train Score is : ' ,model.score(X_train, y_train)) print(type(model).__name__,' Test Score is : ' ,model.score(X_test, y_test)) y_pred = model.predict(X_test) print(type(model).__name__,' F1 Score is : ' ,f1_score(y_test,y_pred)) print('--------------------------------------------------------------------------')",No,3,48.0 y_pred = XGB_Classifier.predict(X_test),No,5,48.0 "import seaborn as sn confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']) sn.heatmap(confusion_matrix, annot=True)",No,5,80.0 "b""from sklearn.metrics import accuracy_score,classification_report\n\nprint(accuracy_score(y_test,y_pred).round(4)*100,'\\n')\n\nprint(pd.crosstab(y_test,y_pred),'\\n')\n\nprint(classification_report(y_test,y_pred),'\\n')""",No,5,49.0 X_test.shape,No,5,58.0 test_predict = XGB_Classifier.predict(test_to_pred),No,5,48.0 "test.reset_index(inplace = True) test.head()",No,5,61.0 predict = test['EventId'],No,3,21.0 "predict = pd.concat([predict,test_predict], axis=1)",No,5,11.0 "predict.to_csv(""submission.csv"",index=False)",No,5,25.0 predict.tail(200),No,5,41.0 sb.countplot(predict.Class),No,5,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "train_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_data.head()",Yes,4,45.0 "test_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,4,45.0 "import seaborn as sns plt.figure(figsize=(15,10)) sns.countplot(train['Cover_Type']) plt.xlabel(""Type of Cover"", fontsize=12) plt.ylabel(""Rows Count"", fontsize=12) plt.show()'",No,4,81.0 "# Bivariate EDA pd.crosstab(train.Soil_Type31, train.Cover_Type)",No,5,40.0 "#Convert dummy features back to categorical x = train.iloc[:,15:55] y = train.iloc[:,11:15] y = pd.DataFrame(y) x = pd.DataFrame(x) s2 = pd.Series(x.columns[np.where(x!=0)[1]]) s3 = pd.Series(y.columns[np.where(y!=0)[1]]) train['soil_type'] = s2 train['Wilderness_Area'] = s3 train.head()",Yes,4,8.0 "# Create a new dataset excluding dummy variables for Multivariate EDA df_viz = train.iloc[:, 0:15] df_viz = df_viz.drop(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4'], axis = 1) df_viz.head()",Yes,3,10.0 train_data['Slope'].plot(kind='hist'),No,5,33.0 test_data['Elevation'].plot(kind='hist'),No,5,33.0 train_data['Cover_Type'].value_counts() # from the results it is visible that no class is oversampled or undersampled,No,5,72.0 from sklearn.model_selection import train_test_split ,No,5,22.0 "X_train, X_val, y_train,y_val = train_test_split(X,y,random_state=40)",No,5,13.0 rfc=RandomForestClassifier(n_estimators=70),No,5,4.0 "rfc.fit(X_train,y_train)",No,5,7.0 "submission = pd.DataFrame(data=predict,columns=['Cover_Type']) submission.head()",No,3,12.0 "submission['Id'] = test_data['Id'] submission.set_index('Id',inplace=True)",No,5,55.0 submission.to_csv('Submission.csv'),No,5,25.0 "import pandas as pd import numpy as np from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from lightgbm import LGBMClassifier from sklearn.linear_model import Ridge from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV",No,5,22.0 "filepath= '/kaggle/input/forest-cover-type-prediction/train.csv' filepath1= '/kaggle/input/forest-cover-type-prediction/test.csv' testdata= pd.read_csv(filepath1) testdata2=testdata traindata= pd.read_csv(filepath) traindata.head()",No,4,45.0 "#We remove the id column in both the training and testing datasets. 
traindata=traindata.drop('Id',axis=1) testdata=testdata.drop('Id',axis=1)",No,5,10.0 "#working with numeric features (They are all numerical features) numeric_features = traindata.select_dtypes(include=[np.number]) numeric_features.dtypes",No,5,70.0 "#We will define the training and testing data here: y=traindata['Cover_Type'] x=traindata.drop('Cover_Type',axis=1) x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",No,5,13.0 "##Now we will run a few machine learning techiniques to see which one is the most applicable #Linear Regression linearRegressor = LinearRegression() linearRegressor.fit(x_train, y_train) y_predicted = linearRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x_train,y_train); y_predicted_r = rf.predict(x_test) mse = mean_squared_error(y_test, y_predicted_r) r = r2_score(y_test, y_predicted_r) mae = mean_absolute_error(y_test,y_predicted_r) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# Decision Tree - CART regressor = DecisionTreeRegressor(random_state = 0) regressor.fit(x_train, y_train) y_predicted_d = regressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted_d) r = r2_score(y_test, y_predicted_d) mae = mean_absolute_error(y_test,y_predicted_d) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#Polynomial Regression polynomial_features= PolynomialFeatures(degree=2) x_poly = polynomial_features.fit_transform(x_train) x_poly_test = polynomial_features.fit_transform(x_test) model = LinearRegression() model.fit(x_poly, y_train) y_predicted_p = model.predict(x_poly_test) mse = mean_squared_error(y_test, y_predicted_p) r = r2_score(y_test, y_predicted_p) mae = mean_absolute_error(y_test,y_predicted_p) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#Ridge Regression ridgereg = Ridge(normalize=True) ridgereg.fit(x_train, y_train) y_pred = ridgereg.predict(x_test) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# LGBMClassifier lgb_clf = LGBMClassifier(random_state=17) lgb_clf.fit(x_train, y_train) y_pred = lgb_clf.predict(x_test) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#GridSearchCV param_grid = {'num_leaves': [7, 15, 31, 63], 'max_depth': [3, 4, 5, 6, -1]} grid_searcher = GridSearchCV(estimator=lgb_clf, param_grid=param_grid, cv=5, verbose=1, n_jobs=4) grid_searcher.fit(x_train, y_train) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,4,6.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x,y); Prediction = rf.predict(testdata)",Yes,4,7.0 "predictionlist=Prediction.tolist() Passengerid=testdata2['Id'].tolist() output=pd.DataFrame(list(zip(Passengerid, predictionlist)), 
columns=['Id','Cover_type']) output.head() output.to_csv('my_submission(ForestCoverTypePrediction).csv', index=False)",Yes,4,25.0 "train_data = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/train.csv"") train_data.shape",No,4,45.0 "test_data = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/test.csv"") test_data.shape",No,4,45.0 "train_data.columns ",No,5,71.0 test_data.columns,No,5,71.0 train_data['Cover_Type'].value_counts(),No,5,72.0 "from sklearn.model_selection import train_test_split X_train,X_test,y_train,y_test = train_test_split(X,y)",No,5,13.0 "print(X_train.shape,y_train.shape) print(X_test.shape,y_test.shape)",Yes,4,14.0 "from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(n_neighbors=5) knn.fit(X_train,y_train) knn.score(X_test,y_test)",No,3,7.0 "pred = knn.predict(test_data.drop(""Id"",axis=1))",Yes,4,7.0 "submission = pd.DataFrame(data=pred,columns=[""Cover_Type""]) submission[""Id""] = test_data[""Id""] submission.set_index(""Id"",inplace=True)",No,5,55.0 "submission.to_csv(""Submission.csv"")",No,5,25.0 "import numpy as np import pandas as pd import seaborn as sns from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,auc from sklearn.preprocessing import StandardScaler from xgboost import XGBClassifier from sklearn.model_selection import train_test_split",No,5,22.0 "df_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') df_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') ",No,5,45.0 df_train.shape,No,5,58.0 "test_id =df_test['Id'] train_id = df_train ['Id']",No,5,77.0 df_train.isnull().sum(),No,5,39.0 df_test.isnull().sum(),No,5,39.0 df_test.columns,No,5,71.0 df_test.dtypes,No,5,70.0 "# From both train and test data df_train.drop(['Id'], axis = 1,inplace = True) df_test.drop(['Id'], axis = 1,inplace = True)",No,5,10.0 sns.heatmap(df_train.isnull()),No,5,80.0 sns.heatmap(df_test.isnull()),No,5,80.0 "corrmat = df_train.corr() sns.heatmap(corrmat,vmax = 0.8,square = True)",No,5,80.0 data_corr.head(),No,5,41.0 "plt.figure(figsize=(15,10)) pd.crosstab(train.Wilderness_Area, train.Cover_Type).plot.barh(figsize=(15,15),stacked = True)",No,5,33.0 "plt.figure(figsize=(15,10)) pd.crosstab(train.soil_type, train.Cover_Type).plot.barh(figsize=(15,15),stacked = True)",No,5,33.0 "plt.subplots(figsize=(10,10)) corr = df_viz.corr() ax = sns.heatmap( corr, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True ) ax.set_xticklabels( ax.get_xticklabels(), rotation=45, horizontalalignment='right' ); ",No,5,80.0 "def add_feature(data): data['Ele_minus_VDtHyd'] = data.Elevation-data.Vertical_Distance_To_Hydrology data['Ele_plus_VDtHyd'] = data.Elevation+data.Vertical_Distance_To_Hydrology data['Distanse_to_Hydrolody'] = (data['Horizontal_Distance_To_Hydrology']**2+data['Vertical_Distance_To_Hydrology']**2)**0.5 data['Hydro_plus_Fire'] = data['Horizontal_Distance_To_Hydrology']+data['Horizontal_Distance_To_Fire_Points'] data['Hydro_minus_Fire'] = data['Horizontal_Distance_To_Hydrology']-data['Horizontal_Distance_To_Fire_Points'] data['Hydro_plus_Road'] = data['Horizontal_Distance_To_Hydrology']+data['Horizontal_Distance_To_Roadways'] data['Hydro_minus_Road'] = data['Horizontal_Distance_To_Hydrology']-data['Horizontal_Distance_To_Roadways'] data['Fire_plus_Road'] = data['Horizontal_Distance_To_Fire_Points']+data['Horizontal_Distance_To_Roadways'] data['Fire_minus_Road'] = data['Horizontal_Distance_To_Fire_Points']-data['Horizontal_Distance_To_Roadways'] return 
data",No,5,8.0 "train = add_feature(train) test = add_feature(test)",No,4,8.0 "X_train = train.drop(['Id','Cover_Type','soil_type','Wilderness_Area'], axis = 1) y_train = train.Cover_Type X_test = test.drop(['Id'], axis = 1)",No,5,21.0 "%%time lr_pipe = Pipeline( steps = [ ('scaler', MinMaxScaler()), ('classifier', LogisticRegression(solver='lbfgs', n_jobs=-1)) ] ) lr_param_grid = { 'classifier__C': [1, 10, 100,1000], } np.random.seed(1) grid_search = GridSearchCV(lr_pipe, lr_param_grid, cv=5, refit='True') grid_search.fit(X_train, y_train) print(grid_search.best_score_) print(grid_search.best_params_)",No,4,6.0 "%%time rf_pipe = Pipeline( steps = [ ('classifier', RandomForestClassifier(n_estimators=500)) ] ) param_grid = { 'classifier__min_samples_leaf': [2, 3, 4, 8], 'classifier__max_depth': [30, 32, 34], } np.random.seed(1) rf_grid_search = GridSearchCV(rf_pipe, param_grid, cv=5, refit='True', n_jobs=-1) rf_grid_search.fit(X_train, y_train) print(rf_grid_search.best_score_) print(rf_grid_search.best_params_)",No,5,6.0 "rf_model = rf_grid_search.best_estimator_ cv_score = cross_val_score(rf_model, X_train, y_train, cv = 5) print(cv_score) print(""Accuracy: %0.2f (+/- %0.2f)"" % (cv_score.mean(), cv_score.std() * 2))",Yes,5,1.0 rf = rf_grid_search.best_estimator_.steps[0][1],No,3,2.0 "feat_imp = rf.feature_importances_ feat_imp_df = pd.DataFrame({ 'feature':X_train.columns, 'feat_imp':feat_imp }) feat_imp_df.sort_values(by='feat_imp', ascending=False).head(10)",Yes,5,79.0 "sorted_feat_imp_df = feat_imp_df.sort_values(by='feat_imp', ascending=True) plt.figure(figsize=[6,6]) plt.barh(sorted_feat_imp_df.feature[-20:], sorted_feat_imp_df.feat_imp[-20:]) plt.show()",No,5,79.0 "%%time xgd_pipe = Pipeline( steps = [ ('classifier', XGBClassifier(n_estimators=50, subsample=0.5)) ] ) param_grid = { 'classifier__learning_rate' : [0.45], 'classifier__min_samples_split' : [8, 16, 32], 'classifier__min_samples_leaf' : [2], 'classifier__max_depth': [15] } np.random.seed(1) xgd_grid_search = GridSearchCV(xgd_pipe, param_grid, cv=5, refit='True', verbose = 10, n_jobs=-1) xgd_grid_search.fit(X_train, y_train) print(xgd_grid_search.best_score_) print(xgd_grid_search.best_params_)",Yes,4,6.0 "xgd_model = xgd_grid_search.best_estimator_ cv_score = cross_val_score(xgd_model, X_train, y_train, cv = 5) print(cv_score) print(""Accuracy: %0.2f (+/- %0.2f)"" % (cv_score.mean(), cv_score.std() * 2))",Yes,5,28.0 final_model = xgd_grid_search.best_estimator_.steps[0][1],No,5,3.0 "final_model.fit(X_train, y_train) y_pred = final_model.predict(X_test)",Yes,4,7.0 print(len(test.Id)),No,5,40.0 print(len(y_pred)),No,5,40.0 "from collections import Counter Counter(y_pred)",Yes,5,72.0 submission_sample.head(),No,5,41.0 "submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': y_pred}) submission.head()",Yes,5,55.0 test_data['Slope'].plot(kind='hist'),No,5,33.0 train_data['Elevation'].plot(kind='hist'),No,5,33.0 "X=train_data.drop(labels=['Id','Cover_Type'],axis=1)",No,5,10.0 y=train_data['Cover_Type'],No,5,21.0 "X_train,X_val,y_train,y_val=train_test_split(X,y,random_state=40)",No,5,13.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,y_train) ",Yes,5,7.0 "Submission=pd.DataFrame(data=predict,columns=['Cover_type']) Submission.head()",Yes,5,55.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True) Submission.head()",Yes,5,55.0 "dataset_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') dataset_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') 
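# Editor's sketch (hedged addition): a quick structural check before the copies are
# made below - it only prints each frame's shape and whether the Cover_Type label
# column is present (expected True for train, False for test).
for _name, _frame in [('train', dataset_train), ('test', dataset_test)]:
    print(_name, _frame.shape, 'Cover_Type' in _frame.columns)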
dataset_train_copy = dataset_train.copy() dataset_test_copy = dataset_test.copy()",No,4,45.0 dataset_train.shape,No,5,58.0 dataset_test.shape,No,5,58.0 "dataset_train_copy.drop('Id', axis=1, inplace=True) dataset_test_copy.drop('Id', axis=1, inplace=True)",No,5,10.0 "X = dataset_train_copy.iloc[:, :-1].values y = dataset_train_copy.iloc[:, -1].values X_submission = dataset_test_copy.iloc[:, :].values",No,3,21.0 "from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X) X_submission = sc.transform(X_submission) y_train = y",Yes,5,18.0 "from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 30, criterion = 'entropy', random_state = 0) classifier.fit(X_train, y_train)",Yes,5,7.0 "y_submission = classifier.predict(X_submission) dataset_submission = pd.DataFrame({'Id':dataset_test.iloc[:,0], 'Cover_Type': y_submission}) dataset_submission.set_index('Id', inplace=True)",Yes,5,55.0 dataset_submission,No,5,41.0 dataset_submission.to_csv('Submission.csv'),No,5,25.0 "train_df=pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_df.head()",Yes,4,45.0 "test_df=pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_df.head()",Yes,4,45.0 train_df.shape,No,5,58.0 test_df.shape,No,5,58.0 test_id=test_df['Id'],No,3,14.0 "train_df.drop(['Id'],axis=1,inplace=True) test_df.drop(['Id'],axis=1,inplace=True)",No,5,10.0 "X=train_df.drop(['Cover_Type'],axis=1) y=train_df['Cover_Type']",No,5,21.0 "X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=40)",No,5,13.0 "from sklearn.neighbors import KNeighborsClassifier KNN=KNeighborsClassifier(n_neighbors=6) KNN.fit(X_train,y_train)",Yes,5,7.0 "KNN.score(X_test,y_test)",No,5,49.0 pred=KNN.predict(test_df),No,5,48.0 "result=pd.DataFrame(data=pred,columns=['Cover_Type'])",No,5,12.0 "result['Id']=test_id result.set_index('Id',inplace=True) result.head()",Yes,3,55.0 result.to_csv('Submission.csv'),No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",Yes,5,88.0 "test_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,4,45.0 "X = train_data.drop(labels = ['Id','Cover_Type'],axis = 1) Y = train_data['Cover_Type']",No,5,21.0 "X_train,X_val,Y_train,Y_val = train_test_split(X,Y,random_state = 40)",No,5,13.0 "print(X_train.shape,Y_train.shape) print(X_val.shape,Y_val.shape)",No,5,58.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,Y_train)",Yes,5,7.0 "rfc.score(X_val,Y_val)",No,5,49.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True)",No,5,55.0 Submission.to_csv('Submission_first_time.csv'),No,5,25.0 train_data.columns,No,5,71.0 "KNN = KNeighborsClassifier(n_neighbors = 11, n_jobs = -1) KNN.fit(X_train,y_train)",Yes,5,7.0 "KNN.score(X_val,y_val)",No,5,49.0 "predict=KNN.predict(test_data.drop(labels=['Id'],axis=1))",No,5,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt %matplotlib inline # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True) ",No,5,55.0 "import numpy as np import pandas as pd from sklearn import cross_validation, grid_search, linear_model, metrics, pipeline, preprocessing",No,5,22.0 "def rmsle(y, y_): log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))",No,5,84.0 "data = pd.read_csv(""../input/train.csv"")",No,5,45.0 data.head(3),No,5,41.0 data.isnull().values.any(),No,4,39.0 "data.datetime = data.datetime.apply(pd.to_datetime) data['month'] = data.datetime.apply(lambda x : x.month) data['hour'] = data.datetime.apply(lambda x : x.hour) data.head()",No,4,8.0 "train_data = data.iloc[:-1000, :] test_data = data.iloc[-1000:, :] print(data.shape, train_data.shape, test_data.shape) ",Yes,4,13.0 "train_labels = train_data['count'].values train_data = train_data.drop(['datetime', 'count', 'casual', 'registered'], axis = 1) test_labels = test_data['count'].values test_data = test_data.drop(['datetime', 'count', 'casual', 'registered'], axis = 1)",No,4,21.0 "binary_data_columns = ['holiday', 'workingday'] binary_data_indices = np.array([(column in binary_data_columns) for column in train_data.columns], dtype = bool) categorical_data_columns = ['season', 'weather', 'month'] categorical_data_indices = np.array([(column in categorical_data_columns) for column in train_data.columns], dtype = bool) numeric_data_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'hour'] numeric_data_indices = np.array([(column in numeric_data_columns) for column in train_data.columns], dtype = bool)",No,4,37.0 "transformer_list = [ #binary ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data[:, binary_data_indices])), #numeric ('numeric_variables_processing', pipeline.Pipeline(steps = [ ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])), ('scaling', preprocessing.StandardScaler(with_mean = 0)) ])), #categorical ('categorical_variables_processing', pipeline.Pipeline(steps = [ ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_data_indices])), ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore')) ])), ]",No,5,8.0 regressor = linear_model.Lasso(max_iter = 2000),No,5,4.0 "estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list=transformer_list)), ('model_fitting', regressor) ] ) estimator.fit(train_data, train_labels) predicted = estimator.predict(test_data) print(""RMSLE: "", rmsle(test_labels, predicted)) print(""MAE: "", metrics.mean_absolute_error(test_labels, predicted))'",Yes,4,49.0 "parameters_grid = { 'model_fitting__alpha' : [0.1, 1, 2, 3, 4, 10, 30] }",No,5,5.0 "rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) 
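# Editor's sketch (hedged addition): a quick sanity check of the rmsle function
# defined a few cells above, before it is wrapped in a scorer. Identical inputs
# must give 0.0; the second call should give a positive value (roughly 0.59).
print(rmsle([1, 10, 100], [1, 10, 100]))    # expected 0.0
print(rmsle([1, 10, 100], [2, 20, 200]))    # expected roughly 0.59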
grid_cv = grid_search.GridSearchCV(estimator, parameters_grid, scoring = rmsle_scorer, cv = 4) grid_cv.fit(train_data, train_labels) predicted = grid_cv.best_estimator_.predict(test_data) print(""RMSLE: "", rmsle(test_labels, predicted)) #print(""MAE: "", metrics.mean_absolute_error(test_labels, predicted)) print(""Best params: "", grid_cv.best_params_)",No,5,2.0 estimator.get_params().keys(),No,5,79.0 "from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(random_state = 0, max_depth = 20, n_estimators = 150) estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list = transformer_list)), ('model_fitting', regressor) ] ) estimator.fit(train_data, train_labels) #metrics.mean_absolute_error(test_labels, estimator.predict(test_data)) print(""RMSLE: "", rmsle(test_labels, estimator.predict(test_data)))'",Yes,4,49.0 "%pylab inline pylab.figure(figsize=(8, 3)) pylab.subplot(1,2,1) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, grid_cv.best_estimator_.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, grid_cv.best_estimator_.predict(test_data), alpha=0.5, color = 'blue') pylab.title('linear model') pylab.subplot(1,2,2) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, estimator.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, estimator.predict(test_data), alpha=0.5, color = 'blue') pylab.title('random forest model')",No,5,56.0 "from sklearn.ensemble import GradientBoostingRegressor gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.9, max_depth = 4) estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list = transformer_list)), ('model_fitting', gbr) ] ) estimator.fit(train_data, train_labels) #metrics.mean_absolute_error(test_labels, estimator.predict(test_data)) print(""RMSLE: "", rmsle(test_labels, estimator.predict(test_data)))'",Yes,4,49.0 "%pylab inline pylab.figure(figsize=(8, 3)) pylab.subplot(1,2,1) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, grid_cv.best_estimator_.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, grid_cv.best_estimator_.predict(test_data), alpha=0.5, color = 'blue') pylab.title('linear model') pylab.subplot(1,2,2) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, estimator.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, estimator.predict(test_data), alpha=0.5, color = 'blue') pylab.title('gbr model')",No,5,56.0 "real_test_data = pd.read_csv(""../input/test.csv"") real_test_data_ids = real_test_data[""datetime""] real_test_data.head()",No,4,45.0 "real_test_data.datetime = real_test_data.datetime.apply(pd.to_datetime) real_test_data['month'] = real_test_data.datetime.apply(lambda x : x.month) real_test_data['hour'] = real_test_data.datetime.apply(lambda x : x.hour) real_test_data.head()",No,4,8.0 "real_test_data = real_test_data.drop(['datetime'], axis = 1)",No,5,10.0 real_test_predictions = estimator.predict(real_test_data),No,5,48.0 "submission.to_csv('bike_predictions.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd 
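# Editor's sketch (hedged, optional): a small provenance print; knowing the library
# versions makes the numbers produced later in this notebook easier to reproduce.
print('numpy', np.__version__, '| pandas', pd.__version__)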
# data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. '",No,5,88.0 "# load train_set&test_set train=pd.read_csv(""../input/train.csv"") test=pd.read_csv(""../input/test.csv"") test['casual']=0 test['registered']=0 test['count']=0 #remove Outlier piont train = train[np.abs(train[""count""]-train[""count""].mean())<=(3*train[""count""].std())] '",Yes,4,45.0 "#create a union data union_data=pd.concat([train,test],ignore_index=True)",No,5,11.0 "#add date columns union_data['day']=pd.to_datetime(union_data.datetime).dt.day union_data['year']=pd.to_datetime(union_data.datetime).dt.year union_data['month']=pd.to_datetime(union_data.datetime).dt.month union_data['weekday']=pd.to_datetime(union_data.datetime).dt.weekday union_data['date']=pd.to_datetime(union_data.datetime).dt.date union_data['hour']=pd.to_datetime(union_data.datetime).dt.hour union_data['year_season']=union_data.apply(lambda x:'{}_{}'.format(str(x['year']),str(x['season'])),axis=1) union_data['year_month']=union_data.apply(lambda x:'{}_{}'.format(str(x['year']),str(x['month'])),axis=1) #missing data fill union_data['windspeed']=union_data[['year','month','hour','windspeed']].groupby(['year','month','hour']).transform(lambda x:x.replace(0,np.median([i for i in x if i>0]))) union_data['windspeed']=pd.cut(union_data['windspeed'],bins=[0,20,60],labels=['0','1']) ",Yes,4,16.0 "#add day_type columns union_data['day_type']=0 union_data['day_type'][(union_data['holiday']==0)& (union_data['workingday']==0)]='weekend' union_data['day_type'][(union_data['holiday']==0)& (union_data['workingday']==1)]='workingday' union_data['day_type'][(union_data['holiday']==1)]='holiday' ",No,5,20.0 "#create train set train=union_data[:10739]",No,4,13.0 "#windspeed counts plt.figure(figsize=(100,5)) g=sns.factorplot(x='windspeed',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#season trend g=sns.factorplot(x='season',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#month trend g=sns.factorplot(x='month',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#day trend g=sns.factorplot(x='day',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#weekday trend g=sns.factorplot(x='weekday',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#hour trend g=sns.factorplot(x='hour',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#weather analyse g=sns.factorplot(x='weather',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#workingday analyse g=sns.factorplot(x='workingday',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#tempture analyse g=sns.factorplot(x='temp',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "from sklearn import tree clf = tree.tree.DecisionTreeRegressor(max_depth=4,criterion='mse',min_samples_leaf=800) clf = 
clf.fit(train['hour'].reshape(-1,1),np.ravel(train['count'])) import graphviz dot_data = tree.export_graphviz(clf, out_file=None,feature_names=['hour'], filled=True, rounded=True, special_characters=True,) graph = graphviz.Source(dot_data) graph #dot_data = tree.export_graphviz(clf, out_file=None,feature_names=train[['hour']].columns.values,class_names=train[['count']].columns.values) #graph = graphviz.Source(dot_data) #graph ",No,3,7.0 train_X.columns,No,5,71.0 "regr = RandomForestRegressor(n_estimators=300) regr.fit(train_X.loc[:,'year_month':], np.ravel(train_y)) reg=GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01,max_depth=4) reg.fit(train_X.loc[:,'year_month':], np.ravel(train_y)) ",No,5,7.0 "np.exp(regr.predict(test_X.loc[:,'year_month':]))-1 ",No,5,55.0 "np.exp(reg.predict(test_X.loc[:,'year_month':]))-1",No,5,48.0 "union_data['count'][10739:]=np.exp(reg.predict(test_X.loc[:,'year_month':]))-1",No,5,8.0 "submission=pd.DataFrame({ ""datetime"": union_data[10739:].datetime, ""count"": union_data[10739:]['count'] }) submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)'",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sn from datetime import datetime from scipy import stats",Yes,4,22.0 "trainData = pd.read_csv('../input/train.csv') testData = pd.read_csv('../input/test.csv')",No,5,45.0 trainData.head(2),No,5,41.0 "fig, axes = plt.subplots(figsize=(15, 4), ncols=2, nrows=1) sn.distplot(trainData[""count""],ax=axes[0]) plt.plot(pd.rolling_mean(trainData['count'], 100)) plt.show()'",No,5,33.0 "trainData['logcount'] = trainData['count'].apply(lambda x: np.log1p(x)) fig, axes = plt.subplots(figsize=(15, 8)) sn.distplot(trainData[""logcount""], ax=axes)'",No,5,33.0 "trainData['date'] = trainData.datetime.apply(lambda x : x.split()[0]) trainData['hour'] = trainData.datetime.apply(lambda x : x.split()[1].split("":"")[0]) trainData['weekday'] = trainData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) trainData['month'] = trainData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) testData['date'] = testData.datetime.apply(lambda x : x.split()[0]) testData['hour'] = testData.datetime.apply(lambda x : x.split()[1].split("":"")[0]) testData['weekday'] = testData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) testData['month'] = testData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) timeColumn = testData['datetime']'",No,5,8.0 "import xgboost as xgb X = trainData.drop(['count', 'datetime', 'registered', 'casual', 'date', 'logcount'], axis=1).values Y = trainData['logcount'].values testX = testData.drop(['datetime', 'date'], axis=1).values trainMatrix = xgb.DMatrix(X, label=Y) max_depth = 5 min_child_weight = 8 subsample = 0.9 num_estimators = 1000 learning_rate = 0.1 clf = xgb.XGBRegressor(max_depth=max_depth, min_child_weight=min_child_weight, subsample=subsample, n_estimators=num_estimators, learning_rate=learning_rate) clf.fit(X,Y) pred = clf.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('XGBNoFE.csv', index=False)'",Yes,2,25.0 "fig, axes = plt.subplots(nrows=1,ncols=2) fig.set_size_inches(15, 8) sn.boxplot(data=trainData, y='count', x='season', ax=axes[0]) sn.boxplot(data=trainData, y='count', x='workingday', ax=axes[1]) axes[0].set(xlabel='season', ylabel='count') 
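# Editor's sketch (hedged addition): a numeric companion to the two box plots in
# this figure - median count by season and by workingday. Read-only; it does not
# alter the axes being configured here.
print(trainData.groupby('season')['count'].median())
print(trainData.groupby('workingday')['count'].median())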
axes[1].set(xlabel='workingday', ylabel='count')",No,5,33.0 "fix, axes = plt.subplots(figsize=(15, 10)) sn.boxplot(data=trainData, y='count', x='hour', ax=axes)",No,5,75.0 "corrMat = trainData.corr() mask = np.array(corrMat) mask[np.tril_indices_from(mask)] = False fig, ax= plt.subplots(figsize=(20, 10)) sn.heatmap(corrMat, mask=mask,vmax=1., square=True,annot=True)",No,5,80.0 "fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(15, 15)) meanMonthly = pd.DataFrame(trainData.groupby('month')['count'].mean()).reset_index().sort_values(by='count', ascending=False) sn.barplot(data=meanMonthly, x='month', y='count', ax=axes[0]) axes[0].set(xlabel='month', ylabel='count') hoursSeasonly = pd.DataFrame(trainData.groupby(['hour', 'season'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursSeasonly['hour'], y=hoursSeasonly['count'], hue=hoursSeasonly['season'], data=hoursSeasonly, join=True, ax=axes[1]) axes[1].set(xlabel='hour', ylabel='count') hoursDayly = pd.DataFrame(trainData.groupby(['hour','weekday'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursDayly['hour'], y=hoursDayly['count'], hue=hoursDayly['weekday'], data=hoursDayly, join=True,ax=axes[2]) axes[2].set(xlabel='hour', ylabel='count') hoursSeasonly = pd.DataFrame(trainData.groupby(['hour', 'month'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursSeasonly['hour'], y=hoursSeasonly['count'], hue=hoursSeasonly['month'], data=hoursSeasonly, join=True, ax=axes[3]) axes[1].set(xlabel='hour', ylabel='count')",No,5,75.0 "X = trainData.drop(['date', 'temp', 'casual', 'registered', 'logcount', 'datetime', 'count'], axis=1) season_df = pd.get_dummies(trainData['season'], prefix='s', drop_first=True) weather_df = pd.get_dummies(trainData['weather'], prefix='w', drop_first=True) hour_df = pd.get_dummies(trainData['hour'], prefix='h', drop_first=True) weekday_df = pd.get_dummies(trainData['weekday'], prefix='d', drop_first=True) month_df = pd.get_dummies(trainData['month'], prefix='m', drop_first=True) X = X.join(season_df) X = X.join(weather_df) X = X.join(hour_df) X = X.join(weekday_df) X = X.join(month_df) X = X.values Y=trainData['logcount'].values print(X.shape) testX = testData.drop(['date', 'temp', 'datetime'], axis=1) season_df = pd.get_dummies(testData['season'], prefix='s', drop_first=True) weather_df = pd.get_dummies(testData['weather'], prefix='w', drop_first=True) hour_df = pd.get_dummies(testData['hour'], prefix='h', drop_first=True) weekday_df = pd.get_dummies(testData['weekday'], prefix='d', drop_first=True) month_df = pd.get_dummies(testData['month'], prefix='m', drop_first=True) testX = testX.join(season_df) testX = testX.join(weather_df) testX = testX.join(hour_df) testX = testX.join(weekday_df) testX = testX.join(month_df) testX = testX.values print(testX.shape)",Yes,4,20.0 "clf=xgb.XGBRegressor(max_depth=8,min_child_weight=6,gamma=0.4,colsample_bytree=0.6,subsample=0.6) clf.fit(X,Y) pred = clf.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('XGBwithFE.csv', index=False)'",Yes,4,25.0 "from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import make_scorer ",No,5,22.0 "def loss_func(truth, prediction): truth = np.expm1(truth) prediction = np.expm1(prediction) log1 = np.array([np.log(x + 1) for x in truth]) log2 = np.array([np.log(x + 1) for x in prediction]) return np.sqrt(np.mean((log1 - 
log2)**2))",No,5,84.0 "b""param_grid = {\n 'n_estimators': [50, 80, 100, 120],\n 'max_depth': [None, 1, 2, 5],\n 'max_features': ['sqrt', 'log2', 'auto']\n}\n\nscorer = make_scorer(loss_func, greater_is_better=False)\n\nregr = RandomForestRegressor(random_state=42)\n\nrfr = GridSearchCV(regr, param_grid, cv=4, scoring=scorer, n_jobs=4).fit(X, Y)\nprint('\\tParams:', rfr.best_params_)\nprint('\\tScore:', rfr.best_score_)""",No,4,2.0 "pred = rfr.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('RandomForest.csv', index=False)'",Yes,4,48.0 "b""#\n#param_grid = {\n# 'learning_rate': [0.1, 0.01, 0.001, 0.0001],\n# 'n_estimators': [100, 1000, 1500, 2000, 4000],\n# 'max_depth': [1, 2, 3, 4, 5, 8, 10]\n#}\n#\n#scorer = make_scorer(loss_func, greater_is_better=False)\n#\n#gb = GradientBoostingRegressor(random_state=42)\n#\n#gbr = GridSearchCV(gb, param_grid, cv=4, scoring=scorer, n_jobs=3).fit(X, Y)\n#print('\\tParams:', gbr.best_params_)\n#print('\\tScore:', gbr.best_score_)\n\ngbr = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01, max_depth=4)\n\ngbr.fit(X, Y)""",No,5,7.0 "pred = gbr.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('GradientBoost.csv', index=False)'",Yes,4,48.0 "df = pd.read_csv('../input/train.csv') df.head()",No,4,45.0 "def null_percentage(column): df_name = column.name nans = np.count_nonzero(column.isnull().values) total = column.size frac = nans / total perc = int(frac * 100) print('%d%% of values or %d missing from %s column.' % (perc, nans, df_name)) def check_null(df, columns): for col in columns: null_percentage(df[col]) check_null(df, df.columns)",No,5,39.0 "def process_features(df): # Get month, day of month, and time of day. months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] df['month'] = df.datetime.apply(lambda x: months[int(x[5:7]) - 1]) df['day'] = df.datetime.apply(lambda x: x[8:10]).astype(int) df['hour'] = df.datetime.apply(lambda x: x[11:13]).astype(int) def get_season(m): if m in ['January', 'February', 'December']: return 'Winter' elif m in [ 'March', 'April', 'May']: return 'Spring' elif m in ['June', 'July','August']: return 'Summer' else: return 'Fall' df['real_seasons'] = df.month.apply(lambda x: get_season(x)) # Change ""feels like"" temperature to deviation from the mean of 24, which is a comfortable temperature. median_temp = df.atemp.median() df['temp_dev'] = df.atemp.apply(lambda x: x - median_temp) # Create a date object and use it to extract day of week. 
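# Editor's note (hedged sketch): an equivalent, more compact day-of-week lookup
# uses the pandas dt accessor once the datetime column is parsed, e.g.
#     df['day_of_week'] = pd.to_datetime(df['datetime']).dt.day_name()
# The explicit weekdays list below is kept as this notebook's actual approach.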
df['date'] = df.datetime.apply(lambda x: dt.strptime(x, ""%Y-%m-%d %H:%M:%S"").date()) weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'] df['day_of_week'] = df.date.apply(lambda x: weekdays[x.weekday()]) df['weekend'] = df.day_of_week.apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0) df = df.drop(['date', 'datetime'], axis=1) print(df.columns) return df'",Yes,4,8.0 df = process_features(pd.read_csv('../input/train.csv')),No,4,45.0 "#Ridership by Month plt.figure('Daily rides by Day of Week', figsize=(10, 26)) plt.suptitle('Daily Rides by Day of Week', fontsize=20) plt.subplot(311) sns.boxplot(x='day_of_week', y='count', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='day_of_week', y='casual', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='day_of_week', y='registered', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,75.0 "#Ridership by Season plt.figure('Daily rides by Season', figsize=(10, 20)) plt.suptitle('Daily Rides by Season', fontsize=20) plt.subplot(311) sns.boxplot(x='real_seasons', y='count', hue='weekend', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='real_seasons', y='casual', hue='weekend', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='real_seasons', y='registered', hue='weekend', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,33.0 "#Ridership by Season plt.figure('Daily rides by Month', figsize=(10, 20)) plt.suptitle('Daily Rides by Month', fontsize=20) plt.subplot(311) sns.boxplot(x='month', y='count', hue='weekend', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='month', y='casual', hue='weekend', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='month', y='registered', hue='weekend', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,75.0 "plt.figure('Wind by month') sns.boxplot(x='month', y='windspeed', data=df) plt.title('Windspeed by Month', fontsize=20) plt.show()",No,5,75.0 df.weather.value_counts(),No,5,72.0 "plt.figure('Weather and Ridership', figsize=(10, 20)) plt.suptitle('Weather and Ridership', fontsize=20) plt.subplot(311) sns.boxplot(x='weather', y='count', data=df) plt.title('All Riders', fontsize=14) plt.subplot(312) sns.boxplot(x='weather', y='casual', data=df) plt.title('Casual Riders', fontsize=14) plt.subplot(313) sns.boxplot(x='weather', y='registered', data=df) plt.title('Registered Riders', fontsize=14) plt.show()",Yes,5,33.0 "def corr_heatmap(df, title): plt.figure('heatmap', figsize=(15,15)) plt.suptitle(plt.title(title, fontsize=30)) df_corr = df.corr() sns.heatmap(df_corr, vmax=0.6, square=True, annot=False, cmap='Blues') plt.yticks(rotation = 0) plt.xticks(rotation = 90) plt.show() corr_heatmap(pd.get_dummies(df), 'Correlation Matrix of All Features')",No,5,80.0 "import pandas as pd df = process_features(pd.read_csv('../input/train.csv')) df_submit = process_features(pd.read_csv('../input/test.csv')) def clean_weather(df): df.loc[df['weather'] == 4, 'weather'] = 3 return df df = clean_weather(df) df_test = clean_weather(df_submit)",Yes,4,45.0 "remove_columns = ['season', #'holiday', 'workingday', #'weather', #'temp', 'atemp', #'humidity', 'windspeed', #'month', 'day', #'hour', #'real_seasons', #'temp_dev', #'day_of_week', 'weekend' ] # Going to make this a multi-label ensemble problem and let make these three # predictions into features that feed into an 
overall model. target_labels = ['casual', 'registered', 'count']",No,5,77.0 "# Strip unwanted features df_train = df.drop(remove_columns, axis=1) df_targets = df_train[target_labels] df_train = df_train.drop(target_labels, axis=1) df_submit = df_test.drop(remove_columns, axis=1) print(df_train.columns) print(df_submit.columns) df_train = pd.get_dummies(df_train) df_submit = pd.get_dummies(df_submit) print(df_train.shape[1] == df_submit.shape[1]) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() df_train = scaler.fit_transform(df_train) df_submit = scaler.transform(df_submit) np_train = np.array(df_train) np_targets = np.array(df_targets) np_submit = np.array(df_submit) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(np_train, np_targets, test_size=0.15)",Yes,3,13.0 "print(X_train.shape) print(y_train.shape)",No,5,58.0 "def rmsle(y_true,y_pred): assert len(y_true) == len(y_pred) return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5",No,5,84.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(n_estimators=5, n_jobs=-1) rf.fit(X_train, y_train) y_pred = rf.predict(X_test) y_pred[y_pred < 0] = 1 print('RMSLE of predicting total count: %.4f' % rmsle(y_test[:,2], y_pred[:,2])) print('RMSLE combining casual and registered predictions: %.4f' % rmsle(y_test[:,2], np.sum(y_pred[:,0:2], axis=1)))",No,4,49.0 "b""count0 = 0\ncount1 = 0\nfor (a,b) in zip(np.sum(y_pred[:,0:2].astype(int), axis=1), y_pred[:,2].astype(int)):\n #print(a, b)\n if abs(a - b) == 0:\n count0 +=1\n if abs(a - b) <= 1:\n count1 +=1\nprint('Exact: %d' % count0)\nprint('Within one: %d ' % count1)\nprint('Total: %d ' % y_pred.shape[0])\nprint('Sum of registered and casual rider predictions is exactly the total count \\nprediction %d%% of the time and within one 100%% of the time.' % int((count0 / y_pred.shape[0])*100))""",No,5,53.0 "import lightgbm as lgb X_t, X_e, y_t, y_e = train_test_split(X_train, y_train[:,2], test_size=0.15) print(y_t.shape) print(y_e.shape) lgb_train = lgb.Dataset(X_t, y_t) lgb_eval = lgb.Dataset(X_e, y_e, reference=lgb_train) params = { 'objective': 'regression', 'metric': 'l2_root', 'num_leaves': 43, 'max_depth': 16 } gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, early_stopping_rounds=5 ) y_pred = gbm.predict(X_test) y_pred[y_pred < 0] = 1 rmsle(y_pred, y_test[:,2])",Yes,3,49.0 "submission = pd.read_csv('../input/sampleSubmission.csv') submission['count'] = np.array(rf.predict(df_submit))[:,2] print(submission.head()) submission.to_csv('submission.csv', index=False)",No,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np# linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import matplotlib as plt import os #print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,22.0 "df=pd.read_csv(""../input/train.csv"") df.head() print (df.shape) testdf=pd.read_csv(""../input/test.csv"") print (testdf.shape) testdf.head()",No,4,45.0 "def missingvalues(df): miss=df.isnull().sum() misspercent=100*df.isnull().sum()/len(df) misvaltable=pd.concat([miss,misspercent],axis=1) misvaltable=misvaltable.rename(columns={0:""missing values"",1:""missing percent""}) return misvaltable df.dtypes.value_counts() ",No,4,37.0 "categoryweather=df.groupby(""holiday"").nunique() print(categoryweather) df1=pd.get_dummies(df['weather']) df1.head()'",No,3,20.0 "import matplotlib.pyplot as plt df.head()",No,4,41.0 "plt.figure(figsize=(20,20)) plt.subplot(4,2,1) plt.hist(df[""season""]) plt.xlabel(""season"") plt.ylabel(""count"") plt.subplot(4,2,2) plt.hist(df[""holiday""]) plt.xlabel(""holiday"") plt.ylabel(""count"") plt.subplot(4,2,3) plt.hist(df[""workingday""]) plt.xlabel(""workingday"") plt.ylabel(""count"") plt.subplot(4,2,4) plt.hist(df[""weather""]) plt.xlabel(""weather"") plt.ylabel(""count"") plt.subplot(4,2,5) plt.hist(df[""temp""]) plt.xlabel(""temp"") plt.ylabel(""count"") plt.subplot(4,2,6) plt.hist(df[""atemp""]) plt.xlabel(""atemp"") plt.ylabel(""count"") plt.subplot(4,2,7) plt.hist(df[""humidity""]) plt.xlabel(""humidity"") plt.ylabel(""count"") plt.subplot(4,2,8) plt.hist(df[""windspeed""]) plt.xlabel(""windspeed"") plt.ylabel(""count"") plt.show()",No,5,33.0 testdf.columns,No,5,71.0 "l=[ 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed'] dftarget=df[[""casual"",""registered"",""count""]].copy() dftarget.head() '",No,4,21.0 "dfnew=df.copy() dfnew.drop([""registered"",""casual"",""count""],axis=1,inplace=True) dfnew.head()",No,4,10.0 "import matplotlib.pyplot as plt df2=pd.concat([dfnew,testdf]) print(df2.shape) print(df.shape)",No,4,11.0 "import matplotlib.pyplot as plt1 plt1.figure(figsize=(20,20)) plt1.subplot(4,2,1) plt1.hist(df2[""season""]) plt1.xlabel(""season"") plt1.ylabel(""count"") plt1.subplot(4,2,2) plt1.hist(df2[""holiday""]) plt1.xlabel(""holiday"") plt1.ylabel(""count"") plt1.subplot(4,2,3) plt1.hist(df2[""workingday""]) plt1.xlabel(""workingday"") plt1.ylabel(""count"") plt1.subplot(4,2,4) plt1.hist(df2[""weather""]) plt1.xlabel(""weather"") plt1.ylabel(""count"") plt1.subplot(4,2,5) plt1.hist(df2[""temp""]) plt1.xlabel(""temp"") plt1.ylabel(""count"") plt1.subplot(4,2,6) plt1.hist(df2[""atemp""]) plt1.xlabel(""atemp"") plt1.ylabel(""count"") plt1.subplot(4,2,7) plt1.hist(df2[""humidity""]) plt1.xlabel(""humidity"") plt1.ylabel(""count"") plt1.subplot(4,2,8) plt1.hist(df2[""windspeed""]) plt1.xlabel(""windspeed"") plt1.ylabel(""count"") plt1.show()",No,5,33.0 "df=pd.read_csv(""../input/train.csv"") df1=pd.get_dummies(df['weather']) df1=df1.rename(columns={1:""clear"",2:""misty"",3:""snow"",4:""heavy snow""}) df=df.drop([""weather""],axis=1) df=pd.concat([df,df1],axis=1) df.head()'",Yes,4,45.0 "df1=pd.get_dummies(df2[""weather""]) df2.drop([""weather""],axis=1,inplace=True) df2=pd.concat([df2,df1],axis=1) df2.head() ",Yes,4,10.0 "#df2=df2.drop([""weather""],axis=1) df2=df2.rename(columns={1:""clear"",2:""misty"",3:""snow"",4:""heavy snow""}) df2.head()",No,4,61.0 "df1=pd.get_dummies(df2[""season""]) #df2=df2.drop([""season""],axis=1,inplace=True) #df1.head() 
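# Editor's sketch (hedged addition): the season dummies created just above should
# be mutually exclusive, so every row ought to have exactly one indicator set.
# This read-only check is expected to print True.
print((df1.sum(axis=1) == 1).all())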
df3=pd.concat([df2,df1],axis=1) #df2.head() df3=df3.rename(columns={1:""spring"",2:""summer"",3:""fall"",4:""winter""}) df3.head()",Yes,4,11.0 df3.shape,No,5,58.0 "df.shape df0=pd.get_dummies(df[""season""]) df4=pd.concat([df,df0],axis=1) df4=df4.rename(columns={1:""spring"",2:""summer"",3:""fall"",4:""winter""}) df4.head()",Yes,4,11.0 "df4.groupby(""spring"").describe()",No,5,40.0 "df4.groupby(""fall"")[""registered""].describe()",No,5,40.0 "df4[""weekend""]=[abs(1-abs(x-y)) for x,y in zip(df4[""workingday""],df4[""holiday""]) ] df4.head()",No,4,41.0 "print(df4.groupby(""weekend"")[""datetime""].nunique()) print(df4.groupby(""holiday"")[""datetime""].nunique()) print(df4.groupby(""workingday"")[""datetime""].nunique())",No,5,54.0 "df4.groupby(""weekend"")['registered',""count"",""casual""].describe()'",No,5,40.0 "df3[""weekend""]=[abs(1-abs(x-y)) for x,y in zip(df3[""workingday""],df3[""holiday""]) ]",No,5,8.0 df3.head(),No,5,41.0 df4.head(25),No,5,41.0 "df4[""time""]=pd.to_datetime(df4[""datetime""]) df4.head()",No,4,16.0 "df4[""hours""]=df4[""time""].dt.hour df4.head()",No,5,8.0 "df3[""time""]=pd.to_datetime(df3[""datetime""]) df3.head()",No,5,16.0 "df3[""hours""]=df3[""time""].dt.hour df3.head()",No,4,8.0 "df4.drop([""time""],axis=1,inplace=True) df3.drop([""time""],axis=1,inplace=True)",No,5,10.0 df4.head(),No,5,41.0 "import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""hours"",y=""registered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""hours"",y=""casual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""hours"",y=""count"",ax=ax)",No,5,75.0 "df4[""logcasual""]=np.log(df4[""casual""]+1) df4[""logcasual""]=np.log(df4[""casual""]+1) df4[""logcasual""]=np.log(df4[""casual""]+1) df4.head()",No,5,8.0 "df4[""logregistered""]=np.log(df4[""registered""]+1) df4[""logcount""]=np.log(df4[""count""]+1) df4.head()",No,4,8.0 "#inspecting hourly trend import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""hours"",y=""logregistered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""hours"",y=""logcasual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""hours"",y=""logcount"",ax=ax) ",No,5,75.0 "df4[""day""]=df4[""time""].dt.day df4.head(25)",No,5,8.0 "df4[""day""]=df4[""time""].dt.dayofweek df4.head()",No,5,8.0 "df4[""day""]=df4[""time""].dt.dayofweek df4.head(25)",No,4,8.0 "#inspecting daily trend import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""day"",y=""logregistered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""day"",y=""logcasual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""day"",y=""logcount"",ax=ax) ",No,5,75.0 "import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""day"",y=""registered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""day"",y=""casual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""day"",y=""count"",ax=ax) ",No,5,75.0 "df5=df.copy() df5.drop([""holiday"",""workingday"",""season""],axis=1,inplace=True) df5.corr()",No,4,10.0 "df4[""year""]=df4['time'].dt.year df4.head()'",No,4,8.0 "plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,1) sns.boxplot(data=df4,x=""year"",y='registered',ax=ax) #plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,2) sns.boxplot(data=df4,x=""year"",y='casual',ax=ax) #plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,3) sns.boxplot(data=df4,x=""year"",y='count',ax=ax) '",No,5,75.0 "plt.figure(figsize=(10,10)) 
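# Editor's sketch (hedged addition): the group medians summarised by the box plot
# drawn below, printed as a quick numeric companion to the figure.
print(df4.groupby('weekend')['casual'].median())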
sns.boxplot(data=df4,x=""weekend"",y=""casual"")",No,5,75.0 "df4.groupby(""weekend"")[""datetime""].nunique()",No,5,54.0 " df4[""month""]=df4[""time""].dt.month df4.head()",No,4,8.0 "#df4.drop([""year_bins""],axis=1,inplace=True) df4[""year_bin""]=""y0"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]<=3)]=""y1"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>3) & (df4[""month""]<=6)]=""y2"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>6) & (df4[""month""]<=9)]=""y3"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>9) & (df4[""month""]<=12)]=""y4"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]<=3)]=""y5"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>3) & (df4[""month""]<=6)]=""y6"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>6) & (df4[""month""]<=9)]=""y7"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>9) & (df4[""month""]<=12)]=""y8"" df4.groupby('year_bin')[""datetime""].nunique()'",No,4,20.0 "plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""year_bin"",y=""casual"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""year_bin"",y=""registered"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""year_bin"",y=""count"",ax=ax)",No,5,33.0 "from sklearn.cross_validation import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score from sklearn import tree",No,5,22.0 "from sklearn.tree import DecisionTreeRegressor l=[""hours""] X=df4[l] Y=df4[""casual""] dtree=DecisionTreeRegressor(max_depth=3) dtree.fit(X,Y)",No,5,7.0 "from sklearn.externals.six import StringIO from IPython.display import Image from sklearn.tree import export_graphviz import graphviz data = export_graphviz(dtree,out_file=None, filled=True, rounded=True, special_characters=True) graph = graphviz.Source(data) graph",Yes,4,22.0 "df4[""casual""].describe()",No,5,40.0 "l=[""hours""] X=df4[l] Y=df4[""registered""] dtree1=DecisionTreeRegressor(max_depth=4) dtree1.fit(X,Y)",No,5,7.0 "data = export_graphviz(dtree1,out_file=None, filled=True, rounded=True, special_characters=True) graph = graphviz.Source(data) graph",No,5,84.0 "df4[""daycas""]=""cas0"" df4[""daycas""].loc[df4[""hours""]<=6.5]=""cas1"" df4[""daycas""].loc[(df4[""hours""]>6.5) & (df4[""hours""]<=7.5)]=""cas2"" df4[""daycas""].loc[(df4[""hours""]>7.5) & (df4[""hours""]<=8.5)]=""cas3"" df4[""daycas""].loc[(df4[""hours""]>8.5) & (df4[""hours""]<=9.5)]=""cas4"" df4[""daycas""].loc[(df4[""hours""]>9.5) & (df4[""hours""]<=10.5)]=""cas5"" df4[""daycas""].loc[(df4[""hours""]>10.5) & (df4[""hours""]<=19.5)]=""cas6"" df4[""daycas""].loc[(df4[""hours""]>19.5) & (df4[""hours""]<=21.5)]=""cas7"" df4[""daycas""].loc[df4[""hours""]>21.5]=""cas8""",No,5,20.0 "df4.groupby(""daycas"")[""datetime""].nunique()",No,5,54.0 "df4[""dayreg""]=""reg0"" df4[""dayreg""].loc[df4[""hours""]<=0.5]=""reg1"" df4[""dayreg""].loc[(df4[""hours""]>0.5) & (df4[""hours""]<=1.5)]=""reg2"" df4[""dayreg""].loc[(df4[""hours""]>1.5) & (df4[""hours""]<=4.5)]=""reg3"" df4[""dayreg""].loc[(df4[""hours""]>4.5) & (df4[""hours""]<=5.5)]=""reg4"" df4[""dayreg""].loc[(df4[""hours""]>5.5) & (df4[""hours""]<=6.5)]=""reg5"" df4[""dayreg""].loc[(df4[""hours""]>6.5) & (df4[""hours""]<=8.5)]=""reg6"" df4[""dayreg""].loc[(df4[""hours""]>8.5) & (df4[""hours""]<=16.5)]=""reg7"" df4[""dayreg""].loc[(df4[""hours""]>16.5) & (df4[""hours""]<=18.5)]=""reg8"" df4[""dayreg""].loc[(df4[""hours""]>18.5) & (df4[""hours""]<=20.5)]=""reg9"" 
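# Editor's sketch (hedged): the reg1..reg12 hour buckets being assigned around this
# point can also be produced with a single pd.cut call over the same thresholds.
# _dayreg_sketch is a throwaway name used only for illustration and is not reused.
_dayreg_sketch = pd.cut(
    df4['hours'],
    bins=[-1, 0.5, 1.5, 4.5, 5.5, 6.5, 8.5, 16.5, 18.5, 20.5, 21.5, 22.5, 24],
    labels=['reg1', 'reg2', 'reg3', 'reg4', 'reg5', 'reg6',
            'reg7', 'reg8', 'reg9', 'reg10', 'reg11', 'reg12'])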
df4[""dayreg""].loc[(df4[""hours""]>20.5) & (df4[""hours""]<=21.5)]=""reg10"" df4[""dayreg""].loc[(df4[""hours""]>21.5) & (df4[""hours""]<=22.5)]=""reg11"" df4[""dayreg""].loc[df4[""hours""]>22.5]=""reg12"" df4.groupby(""dayreg"")[""datetime""].nunique()",No,4,20.0 "df4.head() ",No,5,41.0 "df3[""time""]=pd.to_datetime(df3[""datetime""]) df3[""year""]=df3['time'].dt.year df3[""month""]=df3[""time""].dt.month df3[""day""]=df3[""time""].dt.dayofweek df3.head()'",No,4,8.0 "df3[""year_bin""]=""y0"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]<=3)]=""y1"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>3) & (df3[""month""]<=6)]=""y2"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>6) & (df3[""month""]<=9)]=""y3"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>9) & (df3[""month""]<=12)]=""y4"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]<=3)]=""y5"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>3) & (df3[""month""]<=6)]=""y6"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>6) & (df3[""month""]<=9)]=""y7"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>9) & (df3[""month""]<=12)]=""y8"" df3[""daycas""]=""cas0"" df3[""daycas""].loc[df3[""hours""]<=6.5]=""cas1"" df3[""daycas""].loc[(df3[""hours""]>6.5) & (df3[""hours""]<=7.5)]=""cas2"" df3[""daycas""].loc[(df3[""hours""]>7.5) & (df3[""hours""]<=8.5)]=""cas3"" df3[""daycas""].loc[(df3[""hours""]>8.5) & (df3[""hours""]<=9.5)]=""cas4"" df3[""daycas""].loc[(df3[""hours""]>9.5) & (df3[""hours""]<=10.5)]=""cas5"" df3[""daycas""].loc[(df3[""hours""]>10.5) & (df3[""hours""]<=19.5)]=""cas6"" df3[""daycas""].loc[(df3[""hours""]>19.5) & (df3[""hours""]<=21.5)]=""cas7"" df3[""daycas""].loc[df3[""hours""]>21.5]=""cas8"" df3[""dayreg""]=""reg0"" df3[""dayreg""].loc[df3[""hours""]<=0.5]=""reg1"" df3[""dayreg""].loc[(df3[""hours""]>0.5) & (df3[""hours""]<=1.5)]=""reg2"" df3[""dayreg""].loc[(df3[""hours""]>1.5) & (df3[""hours""]<=4.5)]=""reg3"" df3[""dayreg""].loc[(df3[""hours""]>4.5) & (df3[""hours""]<=5.5)]=""reg4"" df3[""dayreg""].loc[(df3[""hours""]>5.5) & (df3[""hours""]<=6.5)]=""reg5"" df3[""dayreg""].loc[(df3[""hours""]>6.5) & (df3[""hours""]<=8.5)]=""reg6"" df3[""dayreg""].loc[(df3[""hours""]>8.5) & (df3[""hours""]<=16.5)]=""reg7"" df3[""dayreg""].loc[(df3[""hours""]>16.5) & (df3[""hours""]<=18.5)]=""reg8"" df3[""dayreg""].loc[(df3[""hours""]>18.5) & (df3[""hours""]<=20.5)]=""reg9"" df3[""dayreg""].loc[(df3[""hours""]>20.5) & (df3[""hours""]<=21.5)]=""reg10"" df3[""dayreg""].loc[(df3[""hours""]>21.5) & (df3[""hours""]<=22.5)]=""reg11"" df3[""dayreg""].loc[df3[""hours""]>22.5]=""reg12"" df3.head()",No,5,20.0 "df6=df4.copy() df6.drop([""datetime"",""season"",""time"",""count"",""registered"",""casual"",""logcount""],axis=1,inplace=True) df6.head()",No,4,10.0 "df4[""rtemp1""]=df4[""temp""]+df4[""atemp""] df4[""rtemp2""]=df4[""temp""]-df4[""atemp""] df4[""rtemp3""]=df4[""temp""]*df4[""atemp""] print(df4[""temp""].corr(df4[""registered""])) print(df4[""rtemp1""].corr(df4[""registered""])) print(df4[""rtemp2""].corr(df4[""registered""])) print(df4[""rtemp3""].corr(df4[""registered""])) df3[""rtemp3""]=df3[""temp""]*df3[""atemp""]",No,5,8.0 "df3.drop([""datetime"",""season"",""time""],axis=1,inplace=True)",No,5,10.0 "from sklearn.cross_validation import train_test_split from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor from sklearn.metrics import mean_squared_error",No,5,22.0 "df6.head() ",No,5,41.0 "df7=df6.logcasual 
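# Editor's sketch (hedged addition): df7 holds log1p-transformed casual counts, so
# expm1 should recover the original values; a read-only consistency check that is
# expected to print True.
import numpy as np
print(np.allclose(np.expm1(df7), df4['casual']))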
df8=df6.logregistered",No,5,8.0 "df6.drop([""logcasual"",'logregistered'],axis=1,inplace=True) '",No,5,10.0 "df6.drop([""atemp""],axis=1,inplace=True)",No,5,10.0 "df6.drop([""temp""],axis=1,inplace=True)",No,5,10.0 "df6.drop([""rtemp3"",""month""],axis=1,inplace=True) d=pd.read_csv(""../input/train.csv"") df6[""temp""]=d[""temp""] var=[""holiday"",""workingday"",""weekend"",""hours""] for v in var: df6[v]=df6[v].astype(""category"") ",No,3,10.0 "from sklearn.model_selection import train_test_split x_train,x_test,y_train,y_test=train_test_split(dfreg,df7,random_state=42) rf=RandomForestRegressor(n_estimators=500) rf.fit(x_train,y_train) predictions=rf.predict(x_test) mean_squared_error(y_test, predictions)",Yes,4,20.0 "rf1=RandomForestRegressor(n_estimators=500) rf1.fit(dfcas,df7) rf2=RandomForestRegressor(n_estimators=500) rf2.fit(dfreg,df8)",Yes,4,49.0 df6.head(),No,5,41.0 "df3.drop([""temp"",""atemp""],axis=1,inplace=True)",No,5,10.0 "print(df3.shape) print(df6.shape)",No,5,58.0 "newtest=df3.tail(17379-10886) newtest.head()",No,5,41.0 testdf.head(),No,5,41.0 newtest.head(),No,5,41.0 "print(newtest.shape) print(testdf.shape)",No,5,58.0 "newtest.drop([""rtemp3"",""month""],axis=1,inplace=True) f=pd.read_csv(""../input/test.csv"") newtest['year']=newtest.year.replace({2011:0,2012:1}) newtest[""temp""]=f[""temp""] var=[""holiday"",""workingday"",""weekend"",""hours""] for v in var: newtest[v]=newtest[v].astype(""category"") newtestreg=newtest.copy() newtestcas=newtest.copy() newtestreg.drop([""daycas""],axis=1,inplace=True) newreg1=pd.get_dummies(newtestreg[""dayreg""]) newtestreg=pd.concat([newtestreg,newreg1],axis=1) newreg2=pd.get_dummies(newtestreg[""year_bin""]) newtestreg=pd.concat([newtestreg,newreg2],axis=1) newreg3=pd.get_dummies(newtestreg[""day""]) newtestreg=pd.concat([newtestreg,newreg3],axis=1) newtestreg.drop([""dayreg"",""year_bin"",""day""],axis=1,inplace=True) newtestcas.drop([""dayreg""],axis=1,inplace=True) newcas1=pd.get_dummies(newtestcas[""daycas""]) newtestcas=pd.concat([newtestcas,newcas1],axis=1) newcas2=pd.get_dummies(newtestcas[""year_bin""]) newtestcas=pd.concat([newtestcas,newcas2],axis=1) newcas3=pd.get_dummies(newtestcas[""day""]) newtestcas=pd.concat([newtestcas,newcas3],axis=1) newtestcas.drop([""daycas"",""year_bin"",""day""],axis=1,inplace=True) newtest.head() '",No,2,58.0 "predictcas=rf1.predict(newtestcas) predictcas=np.exp(predictcas)-1 predictreg=rf2.predict(newtestreg) predictreg=np.exp(predictreg)-1 ",Yes,4,20.0 print(type(predictcas)),No,5,70.0 "dfcas.head() ",No,5,41.0 import os ,No,5,22.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0])",No,5,45.0 "test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 df_all['hour'] = df['datetime'].dt.hour,No,5,8.0 import numpy as np,No,5,22.0 "df_all['count'] = np.log(df_all['count'] + 1) df_all['registered'] = np.log(df_all['registered'] + 1) df_all['casual'] = np.log(df_all['casual'] + 1)",No,5,8.0 "df_all.shape, df.shape, test.shape",No,5,58.0 df_all.shape,No,5,58.0 "from fastai.imports import * from fastai.structured import *",No,5,22.0 df_all.info(),No,5,40.0 "df = df_all[~df_all['count'].isnull()] test = df_all[df_all['count'].isnull()]",No,5,13.0 "df.shape, test.shape",No,5,58.0 train = df[df['datetimeDay'] <= 15],No,5,14.0 valid = df[df['datetimeDay'] > 15],No,5,14.0 "train.shape, valid.shape",No,5,58.0 "feats = [c for c in df.columns if c not in ['casual', 'registered', 'count']]",No,5,77.0 "feats = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 
'windspeed', 'datetimeDayofweek', 'hour', 'datetimeYear']",No,5,77.0 "rf.fit(train[feats], train['count'])",No,5,7.0 rf.predict(valid[feats]),No,5,48.0 from sklearn.metrics import mean_squared_error,No,5,22.0 "mean_squared_error(valid['count'], rf.predict(valid[feats])) ** (1/2)",No,5,49.0 "pd.Series(rf.feature_importances_, index=feats).sort_values().plot.barh()",No,5,79.0 real_test_predictions.min(),No,2,40.0 "submission = pd.DataFrame({ ""datetime"": real_test_data_ids, ""count"": [max(0, x) for x in real_test_predictions] }) submission.head()",Yes,3,41.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os RANDOM_STATE = 31415",No,5,77.0 "cols = df_train.columns.tolist() cols",No,5,71.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit_transform(df_train)",Yes,4,7.0 "x = df_train.drop(['Cover_Type'],axis = 1) y = df_train['Cover_Type']",No,5,21.0 "x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)",No,5,13.0 "xgb = XGBClassifier() xgb.fit(x_train,y_train)",No,5,7.0 "predics =xgb.predict(x_test) predics ",No,5,48.0 "accuracy_score(y_test,predics)",No,5,49.0 "df_test['Cover_Type'] = xgb.predict(df_test) df_test['Cover_Type']",No,5,48.0 "my_submission = pd.DataFrame({'Id':test_id,'Cover_Type': df_test['Cover_Type']}) my_submission.to_csv('submission.csv', index=False)",No,5,25.0 my_submission.to_csv(r'my_submission.csv'),No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from pandas.tseries.frequencies import to_offset #Set the frequency in the index # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os #print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,22.0 "#Import the files and set the index X_train = pd.read_csv('../input/train.csv') X_test = pd.read_csv('../input/test.csv') X_train = return_set_index(X_train) X_test = return_set_index(X_test) ",No,5,45.0 "#We will devide the train dataset in seasons and working days vs non working days. The idea is to group the results by hour and observe if the ratio #mean standard deviation for each group is better or not for the overall group. 
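# Editor's sketch (hedged): the twelve explicit masks built below can also be
# expressed as one groupby over the same two columns. _by_season_workingday is a
# hypothetical helper name used only for illustration; the explicit subsets that
# follow are what this notebook actually uses.
_by_season_workingday = dict(tuple(X_train.groupby(['season', 'workingday'])))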
X_workingday_1 = X_train[(X_train.workingday == 1) & (X_train.season == 1)] X_holiday_1 = X_train[(X_train.workingday == 0) & (X_train.season == 1)] X_total_1 = X_train[(X_train.season == 1)] X_workingday_2 = X_train[(X_train.workingday == 1) & (X_train.season == 2)] X_holiday_2 = X_train[(X_train.workingday == 0) & (X_train.season == 2)] X_total_2 = X_train[(X_train.season == 2)] X_workingday_3 = X_train[(X_train.workingday == 1) & (X_train.season == 3)] X_holiday_3 = X_train[(X_train.workingday == 0) & (X_train.season == 3)] X_total_3 = X_train[(X_train.season == 3)] X_workingday_4 = X_train[(X_train.workingday == 1) & (X_train.season == 4)] X_holiday_4 = X_train[(X_train.workingday == 0) & (X_train.season == 4)] X_total_4 = X_train[(X_train.season == 4)]",No,5,14.0 "X_final.reset_index(level=0, inplace=True) X_final.to_csv('result.csv', index=False)",No,5,25.0 "import calendar import os import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from datetime import datetime from scipy import stats %matplotlib inline sns.set()",No,5,23.0 "drop_lst = ['casual', 'registered'] df = df.drop(drop_lst, axis=1) df.head()",No,4,10.0 df['count'].head(),No,5,41.0 plt.hist(df['count']);,No,5,33.0 "count_log = np.log(df['count']) plt.hist(count_log);",No,5,33.0 plt.hist(count_boxcox);,No,5,33.0 "df['count_log'] = count_log df['count_boxcox'] = count_boxcox",No,5,8.0 df['datetime'] = pd.to_datetime(df['datetime']),No,5,16.0 "df['dow'] = df['datetime'].dt.dayofweek df.head()",No,5,8.0 "df['month'] = df['datetime'].dt.month df.head()",No,4,8.0 "df['week'] = df['datetime'].dt.week df.head()",No,5,8.0 "df['hour'] = df['datetime'].dt.hour df.head()",No,4,8.0 "df['year'] = df['datetime'].dt.year df.head()",No,4,8.0 "df['day'] = df['datetime'].dt.day df.head()",No,4,8.0 "df = df.drop(labels='datetime', axis=1) df.head()",No,4,10.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['1', '2', '3', '4'] values = df['season'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['season'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Seasons in 2011 & 2012');",No,5,33.0 "spring_2011 = int(df['season'][df['season'] == 1][df['year'] == 2011].value_counts()) summer_2011 = int(df['season'][df['season'] == 2][df['year'] == 2011].value_counts()) fall_2011 = int(df['season'][df['season'] == 3][df['year'] == 2011].value_counts()) winter_2011 = int(df['season'][df['season'] == 4][df['year'] == 2011].value_counts()) spring_2012 = int(df['season'][df['season'] == 1][df['year'] == 2012].value_counts()) summer_2012 = int(df['season'][df['season'] == 2][df['year'] == 2012].value_counts()) fall_2012 = int(df['season'][df['season'] == 3][df['year'] == 2012].value_counts()) winter_2012 =int(df['season'][df['season'] == 4][df['year'] == 2012].value_counts()) print(""Spring 2011: {}"".format(spring_2011)) print(""Summer 2011: {}"".format(summer_2011)) print(""Fall 2011: {}"".format(fall_2011)) print(""Winter 2011: {}"".format(winter_2011)) print(""-----------------------------------------"") print(""Spring 2012: {}"".format(spring_2012)) print(""Summer 2012: {}"".format(summer_2012)) print(""Fall 2012: {}"".format(fall_2012)) print(""Winter 2012: {}"".format(winter_2012))'",No,5,72.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['0', '1'] values = df['holiday'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['holiday'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Holidays in 2011 & 
2012');",No,5,33.0 "# metric to optimize from sklearn.metrics import mean_squared_error from sklearn.metrics import make_scorer scorer = make_scorer(lambda y_test, predictions: np.sqrt(mean_squared_error(y_test, predictions)))",Yes,5,84.0 "no_holiday_2011 = int(df['holiday'][df['holiday'] == 0][df['year'] == 2011].value_counts()) holiday_2011 = int(df['holiday'][df['holiday'] == 1][df['year'] == 2011].value_counts()) no_holiday_2012 = int(df['holiday'][df['holiday'] == 0][df['year'] == 2012].value_counts()) holiday_2012 = int(df['holiday'][df['holiday'] == 1][df['year'] == 2012].value_counts()) print(""No Holidays 2011: {}"".format(no_holiday_2011)) print(""No Holidays 2012: {}"".format(no_holiday_2012)) print(""Holidays 2011: {}"".format(holiday_2011)) print(""Holidays 2012: {}"".format(holiday_2012)) print('----------------') total_2011 = no_holiday_2011 + holiday_2011 total_2012 = no_holiday_2012 + holiday_2012 print('No Holidays 2011: {:.0f}%'.format(no_holiday_2011 / total_2011 * 100)) print('No Holidays 2012: {:.0f}%'.format(no_holiday_2012 / total_2012 * 100))'",No,5,72.0 training_set = pd.read_csv('../input/train.csv'),No,5,45.0 training_set.head(),No,5,41.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['0', '1'] values = df['workingday'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['workingday'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Working day in 2011 & 2012');",No,4,33.0 "no_workingday_2011 = int(df['workingday'][df['workingday'] == 0][df['year'] == 2011].value_counts()) workingday_2011 = int(df['workingday'][df['workingday'] == 1][df['year'] == 2011].value_counts()) no_workingday_2012 = int(df['workingday'][df['workingday'] == 0][df['year'] == 2012].value_counts()) workingday_2012 = int(df['workingday'][df['workingday'] == 1][df['year'] == 2012].value_counts()) print(""No working day 2011: {}"".format(no_workingday_2011)) print(""working day 2011: {}"".format(workingday_2011)) print(""No working day 2012: {}"".format(no_workingday_2012)) print(""working day 2012: {}"".format(workingday_2012)) print('----------------') total_2011 = no_workingday_2011 + workingday_2011 total_2012 = no_workingday_2012 + workingday_2012 print('No working day 2011: {:.0f}%'.format(no_workingday_2011 / total_2011 * 100)) print('No working day 2012: {:.0f}%'.format(no_workingday_2012 / total_2012 * 100))'",No,5,72.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names_2011 = ['1', '2', '3'] names_2012 = ['1', '2', '3', '4'] values = df['weather'][df['year'] == 2011].value_counts() ax[0].bar(names_2011, values) values = df['weather'][df['year'] == 2012].value_counts() ax[1].bar(names_2012, values) fig.suptitle('Weather in 2011 & 2012');",No,5,33.0 "weather_2011_1 = df['weather'][df['weather'] == 1][df['year'] == 2011].value_counts() weather_2011_2 = df['weather'][df['weather'] == 2][df['year'] == 2011].value_counts() weather_2011_3 = df['weather'][df['weather'] == 3][df['year'] == 2011].value_counts() weather_2012_1 = df['weather'][df['weather'] == 1][df['year'] == 2012].value_counts() weather_2012_2 = df['weather'][df['weather'] == 2][df['year'] == 2012].value_counts() weather_2012_3 = df['weather'][df['weather'] == 3][df['year'] == 2012].value_counts() weather_2012_4 = df['weather'][df['weather'] == 4][df['year'] == 2012].value_counts() print('weather_1 in 2011: {}'.format(int(weather_2011_1))) print('weather_2 in 2011: {}'.format(int(weather_2011_2))) print('weather_3 in 2011: {}'.format(int(weather_2011_3))) 
print('--------------') print('weather_1 in 2012: {}'.format(int(weather_2012_1))) print('weather_2 in 2012: {}'.format(int(weather_2012_2))) print('weather_3 in 2012: {}'.format(int(weather_2012_3))) print('weather_4 in 2012: {}'.format(int(weather_2012_4))) print('---------------') total_2011 = int(weather_2011_1) + int(weather_2011_2) + int(weather_2011_3) total_2012 = int(weather_2012_1) + int(weather_2012_2) + int(weather_2012_3) + int(weather_2012_4) print('weather_1 in 2011: {:.0f}%'.format(int(weather_2011_1) / int(total_2011) * 100)) print('weather_2 in 2011: {:.0f}%'.format(int(weather_2011_2) / int(total_2011) * 100)) print('weather_3 in 2011: {:.0f}%'.format(int(weather_2011_3) / int(total_2011) * 100)) print('--------------') print('weather_1 in 2012: {:.0f}%'.format(int(weather_2012_1) / int(total_2012) * 100)) print('weather_2 in 2012: {:.0f}%'.format(int(weather_2012_2) / int(total_2012) * 100)) print('weather_3 in 2012: {:.0f}%'.format(int(weather_2012_3) / int(total_2012) * 100)) print('weather_4 in 2012: {:.0f}%'.format(int(weather_2012_4) / int(total_2012) * 100))",No,5,72.0 "plt.hist(df['temp'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['temp'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "training_set.plot(x = 'datetime', y = 'casual')",No,5,75.0 "training_set.plot(x = 'datetime', y = 'registered')",No,5,75.0 "plt.hist(df['atemp'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['atemp'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['humidity'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['humidity'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['windspeed'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['windspeed'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['dow'][df['year'] == 2011], alpha=0.5, label='2011', bins=7) plt.hist(df['dow'][df['year'] == 2012], alpha=0.5, label='2012', bins=7) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['month'][df['year'] == 2011], alpha=0.5, label='2011', bins=12) plt.hist(df['month'][df['year'] == 2012], alpha=0.5, label='2012', bins=12) plt.legend(loc='upper right');",No,5,75.0 "plt.hist(df['week'][df['year'] == 2011], alpha=0.5, label='2011', bins=52) plt.hist(df['week'][df['year'] == 2012], alpha=0.5, label='2012', bins=52) fig = plt.gcf() fig.set_size_inches(18.5, 10.5) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['hour'][df['year'] == 2011], alpha=0.5, label='2011', bins=24) plt.hist(df['hour'][df['year'] == 2012], alpha=0.5, label='2012', bins=24) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['day'][df['year'] == 2011], alpha=0.5, label='2011', bins=31) plt.hist(df['day'][df['year'] == 2012], alpha=0.5, label='2012', bins=31) fig = plt.gcf() fig.set_size_inches(18.5, 10.5) plt.legend(loc='upper right');",No,5,75.0 "names = ['2011', '2012'] values = df['year'].value_counts() plt.bar(names, values);",No,5,33.0 "count_2011 = df['year'][df['year'] == 2011].count() count_2012 = df['year'][df['year'] == 2012].count() print('2011: {}'.format(count_2011)) print('2012: {}'.format(count_2012))",No,5,72.0 "cor_mat = df[:].corr() mask = np.array(cor_mat) mask[np.tril_indices_from(mask)] = False fig = plt.gcf() fig.set_size_inches(30,12) sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True);",No,5,80.0 "corr = training_set.corr() fig, ax = 
plt.subplots(figsize=(30, 30)) ax.matshow(corr) for (i, j), z in np.ndenumerate(corr): ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center', bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) plt.xticks(range(len(corr.columns)), corr.columns); plt.yticks(range(len(corr.columns)), corr.columns);",No,5,80.0 "from sklearn.model_selection import train_test_split # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['datetime', 'atemp', 'registered', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.casual, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['casual']) X_test = X_test.drop(columns = ['casual'])",Yes,3,13.0 "# Preprocessing for linear regression from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() X_train_norm = scaler.fit_transform(X_train) X_test_norm = scaler.transform(X_test) one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather X_train_norm = one_hot.fit_transform(X_train_norm) X_test_norm = one_hot.transform(X_test_norm)",Yes,4,20.0 from sklearn.linear_model import Lasso,No,5,22.0 "from sklearn.model_selection import cross_val_score casual_model = Lasso() scores = cross_val_score(casual_model, X_train_norm, y_train, cv=5, scoring = scorer) scores",Yes,5,84.0 "casual_model.fit(X_train_norm, y_train)",No,5,7.0 "# Same thing for the second variable # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['datetime', 'atemp', 'casual', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.registered, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['registered']) X_test = X_test.drop(columns = ['registered'])",Yes,4,13.0 "# Preprocessing for linear regression from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() X_train_norm = scaler.fit_transform(X_train) X_test_norm = scaler.transform(X_test) one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather X_train_norm = one_hot.fit_transform(X_train_norm) X_test_norm = one_hot.transform(X_test_norm)",Yes,4,20.0 "from sklearn.linear_model import Lasso from sklearn.model_selection import cross_val_score registered_model = Lasso() scores = cross_val_score(registered_model, X_train_norm, y_train, cv=5, scoring = scorer) scores",Yes,5,28.0 "registered_model.fit(X_train_norm, y_train)",No,5,7.0 "# Final prediction of the baseline models, as I am not going to tweak them, I will move directly to the test data test_dataset = pd.read_csv(""../input/test.csv"")",No,5,45.0 "test_data = test_dataset.drop(columns = ['datetime', 'atemp']) test_data = scaler.transform(test_data) test_data = one_hot.transform(test_data)",Yes,5,20.0 "casual = casual_model.predict(test_data) registered = registered_model.predict(test_data) total = casual + registered",No,4,48.0 test_dataset[test_dataset['count'] < 0],No,5,14.0 "test_dataset.loc[test_dataset['count'] < 0, 'count'] = 0",No,2,8.0 test_dataset[test_dataset['count'] <= 0],No,5,14.0 "test_dataset[['datetime', 'count']].to_csv('result.csv', index = False)",No,4,25.0 "# Ignore the warnings import warnings warnings.filterwarnings('always') warnings.filterwarnings('ignore') # 
data visualisation and manipulation import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib import style import seaborn as sns import missingno as msno #configure # sets matplotlib to inline and displays graphs below the corressponding cell. % matplotlib inline style.use('fivethirtyeight') sns.set(style='whitegrid',color_codes=True) #import the necessary modelling algos. #classifiaction. from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC,SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB #regression from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor from sklearn.svm import SVR from sklearn.neighbors import KNeighborsRegressor #model selection from sklearn.model_selection import train_test_split,cross_validate from sklearn.model_selection import KFold from sklearn.model_selection import GridSearchCV #evaluation metrics from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score # for classification",No,5,23.0 "train=pd.read_csv(r'../input/train.csv') test=pd.read_csv(r'../input/test.csv') df=train.copy() test_df=test.copy() df.head()",Yes,5,45.0 df.columns.unique(),No,5,57.0 df.isnull().sum(),No,5,39.0 msno.matrix(df),No,5,34.0 df.season.value_counts(),No,5,72.0 "sns.factorplot(x='season',data=df,kind='count',size=5,aspect=1.5)",No,5,33.0 "df.holiday.value_counts() sns.factorplot(x='holiday',data=df,kind='count',size=5,aspect=1)",Yes,5,33.0 "df.workingday.value_counts() sns.factorplot(x='workingday',data=df,kind='count',size=5,aspect=1)",Yes,5,33.0 "# 1-> spring # 2-> summer # 3-> fall # 4-> winter sns.factorplot(x='weather',data=df,kind='count',size=5,aspect=1)",No,5,33.0 "sns.boxplot(data=df[['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']]) fig=plt.gcf() fig.set_size_inches(10,10)",No,5,33.0 "df.temp.unique() fig,axes=plt.subplots(2,2) axes[0,0].hist(x=""temp"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,0].set_title(""Variation of temp"") axes[0,1].hist(x=""atemp"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,1].set_title(""Variation of atemp"") axes[1,0].hist(x=""windspeed"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,0].set_title(""Variation of windspeed"") axes[1,1].hist(x=""humidity"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,1].set_title(""Variation of humidity"") fig.set_size_inches(10,10)'",No,5,33.0 "cor_mat= df[:].corr() mask = np.array(cor_mat) mask[np.tril_indices_from(mask)] = False fig=plt.gcf() fig.set_size_inches(30,12) sns.heatmap(data=cor_mat,mask=mask,square=True,annot=True,cbar=True)",No,5,80.0 "season=pd.get_dummies(df['season'],prefix='season') df=pd.concat([df,season],axis=1) df.head() season=pd.get_dummies(test_df['season'],prefix='season') test_df=pd.concat([test_df,season],axis=1) test_df.head()",Yes,4,20.0 "weather=pd.get_dummies(df['weather'],prefix='weather') df=pd.concat([df,weather],axis=1) df.head() weather=pd.get_dummies(test_df['weather'],prefix='weather') test_df=pd.concat([test_df,weather],axis=1) test_df.head()",Yes,4,20.0 
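The two cells above one-hot encode season and weather with pd.get_dummies plus pd.concat, applied separately to df and test_df, and the next cell drops the original columns. Below is a minimal self-contained sketch of that same pattern; the helper name add_dummies and the toy frames are illustrative only (not part of the notebooks), and the final align call is just one possible way to keep train/test columns consistent.

import pandas as pd

def add_dummies(frame, column):
    # one-hot encode a single column and drop the original, mirroring the
    # get_dummies + concat + drop pattern used in the surrounding cells
    dummies = pd.get_dummies(frame[column], prefix=column)
    return pd.concat([frame.drop(column, axis=1), dummies], axis=1)

train_toy = pd.DataFrame({'season': [1, 2, 3], 'weather': [1, 1, 2]})
test_toy = pd.DataFrame({'season': [2, 4], 'weather': [3, 1]})
for col in ['season', 'weather']:
    train_toy = add_dummies(train_toy, col)
    test_toy = add_dummies(test_toy, col)

# encoding train and test separately can leave them with different columns
# (e.g. season_4 appears only in test_toy here), so align them afterwards
train_toy, test_toy = train_toy.align(test_toy, join='outer', axis=1, fill_value=0)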
"df.drop(['season','weather'],inplace=True,axis=1) df.head() test_df.drop(['season','weather'],inplace=True,axis=1) test_df.head()",Yes,4,10.0 "df[""hour""] = [t.hour for t in pd.DatetimeIndex(df.datetime)] df[""day""] = [t.dayofweek for t in pd.DatetimeIndex(df.datetime)] df[""month""] = [t.month for t in pd.DatetimeIndex(df.datetime)] df['year'] = [t.year for t in pd.DatetimeIndex(df.datetime)] df['year'] = df['year'].map({2011:0, 2012:1}) df.head()'",Yes,4,8.0 "test_df[""hour""] = [t.hour for t in pd.DatetimeIndex(test_df.datetime)] test_df[""day""] = [t.dayofweek for t in pd.DatetimeIndex(test_df.datetime)] test_df[""month""] = [t.month for t in pd.DatetimeIndex(test_df.datetime)] test_df['year'] = [t.year for t in pd.DatetimeIndex(test_df.datetime)] test_df['year'] = test_df['year'].map({2011:0, 2012:1}) test_df.head()'",Yes,5,8.0 "df.drop('datetime',axis=1,inplace=True) df.head()",Yes,4,10.0 "df.drop(['casual','registered'],axis=1,inplace=True) df.head()",Yes,4,10.0 "sns.factorplot(x=""hour"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,75.0 "sns.factorplot(x=""month"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,33.0 "sns.factorplot(x=""year"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,75.0 "sns.factorplot(x=""day"",y='count',kind='bar',data=df,size=5,aspect=1)'",No,5,33.0 "plt.scatter(x=""temp"",y=""count"",data=df,color='#ff4125')'",No,5,33.0 "new_df=df.copy() new_df.temp.describe() new_df['temp_bin']=np.floor(new_df['temp'])//5 new_df['temp_bin'].unique() # now we can visualize as follows sns.factorplot(x=""temp_bin"",y=""count"",data=new_df,kind='bar')'",No,4,33.0 df.columns.to_series().groupby(df.dtypes).groups,No,3,40.0 "x_train,x_test,y_train,y_test=train_test_split(df.drop('count',axis=1),df['count'],test_size=0.25,random_state=42)",No,4,13.0 "models=[RandomForestRegressor(),AdaBoostRegressor(),BaggingRegressor(),SVR(),KNeighborsRegressor()] model_names=['RandomForestRegressor','AdaBoostRegressor','BaggingRegressor','SVR','KNeighborsRegressor'] rmsle=[] d={} for model in range (len(models)): clf=models[model] clf.fit(x_train,y_train) test_pred=clf.predict(x_test) rmsle.append(np.sqrt(mean_squared_log_error(test_pred,y_test))) d={'Modelling Algo':model_names,'RMSLE':rmsle} d",Yes,3,7.0 "rmsle_frame=pd.DataFrame(d) rmsle_frame",No,4,12.0 "sns.factorplot(y='Modelling Algo',x='RMSLE',data=rmsle_frame,kind='bar',size=5,aspect=2)",No,4,33.0 "sns.factorplot(x='Modelling Algo',y='RMSLE',data=rmsle_frame,kind='point',size=5,aspect=2)",No,4,33.0 "no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x_train,y_train) pred=clf_rf.predict(x_test) print((np.sqrt(mean_squared_log_error(pred,y_test))))'",Yes,3,6.0 clf_rf.best_params_,No,5,2.0 "n_neighbors=[] for i in range (0,50,5): if(i!=0): n_neighbors.append(i) params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]} clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_knn.fit(x_train,y_train) pred=clf_knn.predict(x_test) print((np.sqrt(mean_squared_log_error(pred,y_test))))",Yes,4,6.0 clf_knn.best_params_,No,5,2.0 "pred=clf_rf.predict(test_df.drop('datetime',axis=1)) d={'datetime':test['datetime'],'count':pred} ans=pd.DataFrame(d) ans.to_csv('answer.csv',index=False)",Yes,4,25.0 "# train . train = pd.read_csv(""../input/train.csv"") # train . 
print(train.shape) # train 5 . train.head()'",Yes,4,45.0 "# test test = pd.read_csv(""../input/test.csv"") # test . print(test.shape) # test 5 . test.head()'",Yes,4,45.0 "# . # train[""datetime""].dt.year # . # String . # Datetime . # datetime . train[""datetime""] = pd.to_datetime(train[""datetime""])'",No,5,16.0 "# train . print(train.shape) # "" "" . train[""datetime-year""] = train[""datetime""].dt.year train[""datetime-month""] = train[""datetime""].dt.month train[""datetime-day""] = train[""datetime""].dt.day train[""datetime-hour""] = train[""datetime""].dt.hour train[""datetime-minute""] = train[""datetime""].dt.minute train[""datetime-second""] = train[""datetime""].dt.second # 20180124 train[""datetime-dayofweek""] = train[""datetime""].dt.dayofweek # train . print(train.shape) # . train[[""datetime"", ""datetime-year"", ""datetime-month"", ""datetime-day"", ""datetime-hour"", ""datetime-minute"", ""datetime-second"", ""datetime-dayofweek""]].head()'",Yes,4,16.0 "# test datetime Type String datetime . test[""datetime""] = pd.to_datetime(test[""datetime""])'",No,5,16.0 "# train . print(test.shape) # datetime . test[""datetime-year""] = test[""datetime""].dt.year test[""datetime-month""] = test[""datetime""].dt.month test[""datetime-day""] = test[""datetime""].dt.day test[""datetime-hour""] = test[""datetime""].dt.hour test[""datetime-minute""] = test[""datetime""].dt.minute test[""datetime-second""] = test[""datetime""].dt.second # 20180124 test[""datetime-dayofweek""] = test[""datetime""].dt.dayofweek # train . print(test.shape) # . test[[""datetime"", ""datetime-year"", ""datetime-month"", ""datetime-day"", ""datetime-hour"", ""datetime-minute"", ""datetime-second"", ""datetime-dayofweek""]].head()'",Yes,4,8.0 "import seaborn as sns # . %matplotlib inline'",No,5,23.0 "sns.barplot(data=train, x=""weather"", y=""count"")",No,5,33.0 "sns.lmplot(data=train, x=""temp"", y=""atemp"")",No,5,33.0 "sns.distplot(train[""windspeed""])",No,5,33.0 "sns.barplot(data=train, x=""datetime-year"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-month"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-day"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-hour"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-minute"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-second"", y=""count"")",No,5,33.0 "# Integer String . train[""datetime-year_month""] = train[""datetime-year""].astype(str) + ""-"" + train[""datetime-month""].astype(str) print(train.shape) train.info()'",Yes,3,40.0 "train[[""datetime-year"", ""datetime-month"", ""datetime-year_month""]]",No,5,41.0 "sns.barplot(data=train, x=""datetime-year_month"", y=""count"")",No,5,33.0 "# ! import matplotlib.pyplot as plt plt.figure(figsize=(24,4)) sns.barplot(data=train, x=""datetime-year_month"", y=""count"")'",Yes,5,33.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"")",No,5,33.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"", hue=""workingday"")",No,5,75.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"", hue=""datetime-dayofweek"")",No,5,33.0 "# x_train . x_train = train[feature_names] # x_train . print(x_train.shape) # x_test 5 . x_train.head()'",Yes,4,41.0 "# x_test . x_test = test[feature_names] # x_test . print(x_test.shape) # x_test 5 . x_test.head()'",Yes,4,41.0 "# . 
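# Use the 'count' column of train as the target vector y_train and preview it.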
label_name = ""count"" y_train = train[label_name] print(y_train.shape) y_train.head()'",Yes,4,58.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor() model",Yes,5,4.0 "from sklearn.model_selection import cross_val_predict y_predict = cross_val_predict(model, x_train, y_train, cv=20) print(y_predict.shape) y_predict",Yes,4,27.0 "score = abs(y_train - y_predict).mean() f""score(Mean Absolute Error)={score:.6f}""",No,5,28.0 "# . model.fit(x_train, y_train)'",No,5,7.0 "# x_test . predictions = model.predict(x_test) # array . predictions'",No,5,48.0 "# predictions[:5]'",No,3,41.0 "submit = pd.read_csv(""../input/sampleSubmission.csv"") print(submit.shape) submit.head()",Yes,3,45.0 "submit[""count""] = predictions print(submit.shape) submit.head()",Yes,3,41.0 "submit.to_csv(""baseline-script.csv"", index=False) pd.read_csv(""baseline-script.csv"").head()",Yes,4,25.0 "df_train = pd.read_csv('../input/train.csv') df_train.head()",Yes,4,45.0 dtIdx = pd.DatetimeIndex(df_train['datetime']),No,5,16.0 "df_train['hour'] = dtIdx.hour df_train['dayofweek'] = dtIdx.dayofweek df_train['month'] = dtIdx.month df_origin = df_train",No,4,16.0 "df_train = df_train.drop(['casual', 'registered', 'datetime'], axis = 1) df_train.head()",Yes,5,10.0 "df_train_data = df_train.drop('count', axis=1)",No,5,10.0 df_train_target = df_train['count'],No,2,13.0 df_train_target.head(),No,5,41.0 "from sklearn import linear_model from sklearn import svm from sklearn.ensemble import RandomForestRegressor from sklearn import model_selection",No,5,22.0 "ms = model_selection.ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)",No,5,13.0 df_train_data.head(),No,5,41.0 "X_train, X_test, y_train, y_test = model_selection.\\ train_test_split(df_train_data, df_train_target, test_size = 0.2, random_state=0)'",No,5,13.0 \,No,5,6.0 "import matplotlib.pyplot as plt from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)): """""" Generate a simple plot of the test and training learning curve. Parameters ---------- estimator : object type that implements the ""fit"" and ""predict"" methods An object of that type which is cloned for each validation. title : string Title for the chart. X : array-like, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples) or (n_samples, n_features), optional Target relative to X for classification or regression; None for unsupervised learning. ylim : tuple, shape (ymin, ymax), optional Defines minimum and maximum yvalues plotted. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is not a classifier or if ``y`` is neither binary nor multiclass, :class:`KFold` is used. Refer :ref:`User Guide ` for the various cross-validators that can be used here. n_jobs : int or None, optional (default=None) Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` for more details. train_sizes : array-like, shape (n_ticks,), dtype float or int Relative or absolute numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of the maximum size of the training set (that is determined by the selected validation method), i.e. it has to be within (0, 1]. Otherwise it is interpreted as absolute sizes of the training sets. Note that for classification the number of samples usually have to be big enough to contain at least one sample from each class. (default: np.linspace(0.1, 1.0, 5)) """""" plt.figure() plt.title(title) if ylim is not None: plt.ylim(*ylim) plt.xlabel(""Training examples"") plt.ylabel(""Score"") train_sizes, train_scores, test_scores = learning_curve( estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.grid() plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color=""r"") plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color=""g"") plt.plot(train_sizes, train_scores_mean, 'o-', color=""r"", label=""Training score"") plt.plot(train_sizes, test_scores_mean, 'o-', color=""g"", label=""Cross-validation score"") plt.legend(loc=""best"") return plt # cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0) # title = ""Learning Curves (RFR, n_estimators=100)"" # estimator = RandomForestRegressor(n_estimators=100) # plot_learning_curve(estimator, title, # df_train_data, df_train_target, ylim=(0.7, 1.01), cv=cv, n_jobs=4) # plt.show()'",Yes,4,35.0 df_origin.columns,No,5,71.0 "df_origin.groupby('windspeed').mean().plot(y='count', marker='o')",No,3,33.0 "df_origin.groupby('humidity').mean().plot(y='count', marker='o')",No,4,33.0 d = df_origin.groupby('humidity'),No,5,60.0 "corr = df_origin[['temp','weather','windspeed','dayofweek', 'month', 'hour','count']].corr() corr",No,5,40.0 "import matplotlib.pyplot as plt plt.figure() plt.matshow(corr) plt.colorbar() plt.show()",Yes,5,80.0 "df_test = pd.read_csv('../input/test.csv') df_test.head()",Yes,4,45.0 "df_sample = pd.read_csv('../input/sampleSubmission.csv') df_sample.head()",Yes,4,45.0 "df_test['hour'] = pd.DatetimeIndex(df_test['datetime']).hour df_test['dayofweek'] = pd.DatetimeIndex(df_test['datetime']).dayofweek df_test['month'] = pd.DatetimeIndex(df_test['datetime']).month df_test_data = df_test.drop(['datetime'], axis=1) df_test_data.head()",Yes,4,8.0 "score = rfr.score(df_train_data, df_train_target)",No,5,28.0 "print(""score: %.3f""%score)",No,5,84.0 df_sample.head(),No,5,41.0 df_sample.info(),No,5,40.0 "df_sample.to_csv('submission.csv', index=False)",No,5,25.0 "df_demo = pd.read_csv('submission.csv') df_demo.head()",No,4,45.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os print(os.listdir(""../input"")) import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import datetime from sklearn import datasets, linear_model from sklearn.metrics import mean_squared_error %matplotlib inline data = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") test.dtypes ",Yes,3,45.0 "print(test.head(5)) ",No,5,41.0 "# Extract hours from datetime data['datetime'] = pd.to_datetime(data['datetime']) data['hour'] = data['datetime'].dt.hour data['month'] = data['datetime'].dt.month test['datetime'] = pd.to_datetime(test['datetime']) test['hour'] = data['datetime'].dt.hour test['month'] = data['datetime'].dt.month data['season'] = data.season.astype('category') data['month'] = data.month.astype('category') data['hour'] = data.hour.astype('category') data['holiday'] = data.holiday.astype('category') data['workingday'] = data.workingday.astype('category') data['weather'] = data.weather.astype('category') test['season'] = test.season.astype('category') test['month'] = test.month.astype('category') test['hour'] = test.hour.astype('category') test['holiday'] = test.holiday.astype('category') test['workingday'] = test.workingday.astype('category') test['weather'] = test.weather.astype('category') data.dtypes",No,3,16.0 "data = data.drop(['atemp', 'casual', 'registered', 'windspeed'], axis=1) test = test.drop(['atemp','windspeed'], axis=1)",No,5,10.0 test.head(2),No,5,41.0 "import math data['count'] = data['count'].transform(lambda x: math.log(x))",Yes,5,8.0 "data = data.drop(['datetime'], axis=1) data_dummy = data #test = test.drop(['datetime'], axis=1) test_dummy = test def dummify_dataset(df, column): df = pd.concat([df, pd.get_dummies(df[column], prefix=column, drop_first=True)],axis=1) df = df.drop([column], axis=1) return df columns_to_dummify = ['season', 'month', 'hour', 'holiday', 'workingday', 'weather'] for column in columns_to_dummify: data_dummy = dummify_dataset(data_dummy, column) test_dummy = dummify_dataset(test_dummy, column) test_dummy.head(5)",Yes,5,20.0 "from sklearn.model_selection import train_test_split y = data_dummy['count'] X = data_dummy.drop(['count'], axis=1) X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42) ",Yes,4,21.0 "from sklearn.tree import DecisionTreeRegressor from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV from sklearn.metrics import mean_squared_log_error from sklearn.model_selection import cross_val_score from sklearn.model_selection import KFold from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor etr = ExtraTreesRegressor(max_depth= 20, n_estimators= 500) #etr.fit(X_train, y_train) #Y_to_train = train_sample[""count""] #X_to_train = train_sample.drop(['count'], axis=1) etr.fit(X_train,y_train) #y_pred = etr.predict(test_sample)'",Yes,5,7.0 "test_with_datetime = pd.read_csv(""../input/test.csv"") test_dummy = test_dummy.drop(['datetime'], axis=1) test_predictions = etr.predict(test_dummy)'",Yes,3,48.0 "predictions = np.exp(test_predictions ) submission = pd.DataFrame({ 'datetime': test.datetime.values, 'count': predictions }) submission.to_csv(""my_submission_10.csv"", index=False)'",Yes,5,25.0 "train = pd.read_csv('../input/train.csv') train.head()",Yes,4,45.0 "test = pd.read_csv('../input/test.csv') test.head()",Yes,4,45.0 "train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = 
pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""])",No,5,45.0 train.dtypes,No,5,70.0 "train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek",No,5,8.0 "train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed""], 1, inplace=True)",No,5,10.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(n_estimators=100) rf.fit(train,y_train) preds = rf.predict(test)",Yes,4,7.0 "submission = pd.read_csv('../input/sampleSubmission.csv') submission.head()",Yes,4,45.0 "submission[""count""] = np.expm1(preds) submission.head()",Yes,5,55.0 "submission.to_csv(""allrf.csv"", index=False)",No,5,25.0 "sample=pd.read_csv('../input/sampleSubmission.csv') train_df=pd.read_csv('../input/train.csv') test_df=pd.read_csv('../input/test.csv')",No,5,45.0 sample.head(),No,5,41.0 train_df.nunique(),No,5,54.0 test_df.nunique(),No,5,54.0 train_df.describe(),No,5,40.0 season_df=train_df.groupby('season'),No,5,60.0 season_df.head(),No,5,41.0 "train_df['hour']=hour train_df['day']=day train_df['month']=month train_df['year']=year",No,2,16.0 "hour=[] day=[] month=[] year=[] for row in test_df['datetime']: date_hour=row.split() date=date_hour[0] hour_row=date_hour[1] hour.append(hour_row.split(':')[0]) date=date.split('-') day.append(date[2]) month.append(date[1]) year.append(date[0]) test_df['hour']=hour test_df['day']=day test_df['month']=month test_df['year']=year",No,5,8.0 "datetime=['hour','day','month','year'] for time in datetime: train_df[time]=train_df[time].astype(int) test_df[time]=test_df[time].astype(int)",No,4,16.0 "#Continous Features Analysis for i in range(len(cont_feat)-1): for j in range(i+1,len(cont_feat)): sns.jointplot(cont_feat[i],cont_feat[j],data=train_df) plt.title('{} relation with {}'.format(cont_feat[i],cont_feat[j])) plt.show() ",No,5,33.0 "#Categorical feature analysis for cat in cat_feat: sns.barplot(x=cat,y='count',data=train_df,estimator=sum) plt.title('{} vs total_rent'.format(cat)) plt.show()",No,4,81.0 "climate=['temp','humidity','windspeed'] for clim in climate: sns.swarmplot(x='hour',y=clim,hue='season',data=train_df) plt.title('{} vs {}'.format('hour',clim)) plt.show()",No,5,33.0 "sns.distplot(train_df['count']) train_df['count']=train_df['count'].apply(lambda x:np.log(x))",No,4,33.0 sns.heatmap(train_df.corr()),No,5,80.0 train_df=pd.DataFrame(train_df),No,5,12.0 "train_df.set_index('datetime',inplace=True)",No,5,61.0 "test_df.set_index('datetime',inplace=True)",No,4,61.0 "train_df.drop(columns=['casual','registered'],axis=1,inplace=True)",No,4,61.0 "weather_df=pd.get_dummies(train_df['weather'],prefix='weather') yr_df=pd.get_dummies(train_df['year'],prefix='year') month_df=pd.get_dummies(train_df['month'],prefix='month') hour_df=pd.get_dummies(train_df['hour'],prefix='hour') season_df=pd.get_dummies(train_df['season'],prefix='season') train_df=train_df.join(weather_df) train_df=train_df.join(yr_df) train_df=train_df.join(month_df) train_df=train_df.join(hour_df) train_df=train_df.join(season_df) weather_df=pd.get_dummies(test_df['weather'],prefix='weather') yr_df=pd.get_dummies(test_df['year'],prefix='year') month_df=pd.get_dummies(test_df['month'],prefix='month') hour_df=pd.get_dummies(test_df['hour'],prefix='hour') 
season_df=pd.get_dummies(test_df['season'],prefix='season') test_df=test_df.join(weather_df) test_df=test_df.join(yr_df) test_df=test_df.join(month_df) test_df=test_df.join(hour_df) test_df=test_df.join(season_df)",No,4,20.0 "train_df.drop(columns=['season','hour','month','year','weather'],axis=1,inplace=True) test_df.drop(columns=['season','hour','month','year','weather'],axis=1,inplace=True)",No,5,10.0 "def rmlse(predicted,actual): sum_val=0 for i in range(len(predicted)): sum_val+=(np.log(predicted[i]+1)-np.log(actual[i]+1))**2 return (sum_val/len(predicted))**(0.5)",No,5,84.0 "X=train_df.drop(columns='count',axis=1) y=train_df['count']",No,5,21.0 X.info(),No,5,40.0 "from xgboost import XGBRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42) ''' param_test1 = { 'max_depth':range(3,10,2), 'min_child_weight':range(1,6,2) } ''' ''' param_test2={ 'gamma':[0,0.125,0.25,0.5,0.75,1] } ''' ''' param_test3={ 'min_child_weight':[1,2,3,4,5,6,7,8,9] } ''' ''' param_test4={ 'learning_rate':[0.1,0.01,0.001] } ''' param_test5={ 'subsample':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] } gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.125, n_estimators=1000, max_depth=9, min_child_weight=4, gamma=0.125, subsample=0.8, colsample_bytree=0.8, random_state=42), param_grid = param_test5,n_jobs=4,iid=False, cv=5) gsearch1.fit(X_train,y_train) predicted=gsearch1.predict(X_test) print('Model Score: {}'.format(rmlse(np.exp(predicted),np.exp(y_test)))) print(gsearch1.best_params_)",No,4,6.0 y_test,Yes,5,41.0 "import xgboost as xgb xgr=xgb.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=9, min_child_weight=4, gamma=0.125, subsample=1, colsample_bytree=0.8) xgr.fit(train_df.drop(columns='count',axis=1),train_df['count']) y_predict=xgr.predict(test_df)",No,4,7.0 test_df['count']=np.exp(y_predict),Yes,5,8.0 result=pd.DataFrame(),No,5,12.0 "result['datetime']=test_df['datetime'] result['count']=test_df['count']",No,5,55.0 "result.to_csv('output.csv',index=False)",No,5,25.0 "sns.pointplot(x=df['temp'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "sns.pointplot(x=df['atemp'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "_, _, r_value, _, _ = stats.linregress(df['count'], df['atemp']) r_square = r_value ** 2 r_square.round(2)",No,5,47.0 "sns.pointplot(x=df['hour'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,75.0 "sns.pointplot(x=df['temp'], y=df['atemp']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "_, _, r_value, _, _ = stats.linregress(df['temp'], df['atemp']) r_square = r_value ** 2 r_square.round(2)",No,5,47.0 "df = df.drop(labels='atemp', axis=1)",No,5,10.0 "df = df.drop(labels='count_log', axis=1)",No,5,10.0 "df = df.drop(labels='count_boxcox', axis=1)",No,5,10.0 "df = pd.get_dummies(df, columns=['weather']) df.head()",No,5,20.0 "df = df.drop(labels='weather_4', axis=1) df.head()",No,5,10.0 "df['temp_weath_1'] = df['temp'] * df['weather_1'] df['temp_weath_2'] = df['temp'] * df['weather_2'] df['temp_weath_3'] = df['temp'] * df['weather_3']",No,5,8.0 "df['temp_weath_1'] = df['temp_weath_1'].astype(int) df['temp_weath_2'] = df['temp_weath_2'].astype(int) df['temp_weath_3'] = df['temp_weath_3'].astype(int)",No,5,16.0 "X = df.loc[:, df.columns != 'count'] y = np.log(df['count'])",No,5,21.0 "X.shape, y.shape",No,5,58.0 "X_train, X_test, y_train, y_test = 
train_test_split(X, y, random_state=42, test_size=0.2)",No,5,13.0 "X_train.shape, y_train.shape, X_test.shape, y_test.shape",No,5,58.0 "from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge from sklearn.tree import DecisionTreeRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor from sklearn.svm import SVR from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, minmax_scale, QuantileTransformer, RobustScaler, PolynomialFeatures from sklearn.model_selection import KFold, cross_val_score from xgboost import XGBRegressor",No,5,22.0 "pipelines = [] pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())]))) pipelines.append(('ScaledLASSO', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('LASSO', Lasso(random_state=42))]))) pipelines.append(('ScaledRID', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RID', Ridge(random_state=42))]))) pipelines.append(('ScaledKNN', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor(n_neighbors=2))]))) pipelines.append(('ScaledCART', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor(random_state=42))]))) pipelines.append(('ScaledGBM', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor(random_state=42))]))) pipelines.append(('ScaledRFR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RFR', RandomForestRegressor(random_state=42))]))) pipelines.append(('ScaledSVR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('SVR', SVR(kernel='linear'))]))) pipelines.append(('ScaledXGBR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('XGBR', XGBRegressor(random_state=42))]))) results = [] names = [] for name, model in pipelines: kfold = KFold(random_state=42) cv_results = -cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_log_error') results.append(np.sqrt(cv_results)) names.append(name) msg = ""{}: {} ({})"".format(name, cv_results.mean(), cv_results.std()) print(msg)'",No,5,79.0 "df_test = pd.read_csv(""../input/test.csv"")",No,5,45.0 df_test['datetime'] = pd.to_datetime(df_test['datetime']),No,5,16.0 df_test['dow'] = df_test['datetime'].dt.dayofweek,No,5,8.0 df_test['month'] = df_test['datetime'].dt.month,No,5,8.0 df_test['week'] = df_test['datetime'].dt.week,No,5,8.0 df_test['hour'] = df_test['datetime'].dt.hour,No,5,8.0 df_test['year'] = df_test['datetime'].dt.year,No,5,8.0 df_test['day'] = df_test['datetime'].dt.day,No,5,8.0 "df_test = df_test.drop(labels='datetime', axis=1)",No,5,10.0 "df_test = df_test.drop(labels='atemp', axis=1)",No,5,10.0 "df_test = pd.get_dummies(df_test, columns=['weather'])",No,5,20.0 "df_test = df_test.drop(labels='weather_4', axis=1)",No,5,10.0 "df_test['temp_weath_1'] = df_test['temp'] * df_test['weather_1'] df_test['temp_weath_2'] = df_test['temp'] * df_test['weather_2'] df_test['temp_weath_3'] = df_test['temp'] * df_test['weather_3']",No,5,8.0 "df_test['temp_weath_1'] = df_test['temp_weath_1'].astype(int) df_test['temp_weath_2'] = df_test['temp_weath_2'].astype(int) df_test['temp_weath_3'] = df_test['temp_weath_3'].astype(int)",No,5,16.0 "standardscaler = StandardScaler() model = 
XGBRegressor(colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7, random_state=42)",No,5,4.0 "model.fit(X_train, y_train)",No,5,7.0 model.predict(df_test),No,5,48.0 "pipe = Pipeline([('poly', PolynomialFeatures()), ('StandardScaler', standardscaler), ('XGBR', model)]) pipe.fit(X_train, y_train) y_pred = np.exp(pipe.predict(df_test)) y_pred",Yes,4,7.0 "df_test[['count']].to_csv('submission.csv', index=True)",No,5,25.0 df_test[['count']].head(),No,5,41.0 "# Carregar os dados df = pd.read_csv('../input/train.csv', parse_dates=[0]) test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "b""# Transformao da coluna datetime (feature engineering)\ndf['year'] = df['datetime'].dt.year\ndf['month'] = df['datetime'].dt.month\ndf['day'] = df['datetime'].dt.day\ndf['dayofweek'] = df['datetime'].dt.dayofweek\ndf['hour'] = df['datetime'].dt.hour""",No,5,8.0 "# Ordenar os dados pela coluna datetime df.sort_values('datetime', inplace=True)",No,5,9.0 "# Separando os dataframes test = df[df['count'].isnull()] df = df[~df['count'].isnull()]",No,5,13.0 "# Separando o df em treino e validao from sklearn.model_selection import train_test_split'",No,5,22.0 "train, valid = train_test_split(df, random_state=42)",No,5,13.0 "# Usar o modelo de RandomForest # Importar o modelo from sklearn.ensemble import RandomForestRegressor",No,5,22.0 "# Instanciar o modelo rf = RandomForestRegressor(random_state=42)",No,5,4.0 "# Treinar o modelo rf.fit(train[feats], train['count'])",No,5,7.0 "# Fazendo as previses preds = rf.predict(valid[feats])'",No,5,48.0 "# Analisar as previses com base na mtrica # Importando a mtrica from sklearn.metrics import mean_squared_error'",No,5,22.0 "b""# Validando as previses\nmean_squared_error(valid['count'], preds) ** (1/2)""",No,5,49.0 "# Melhorando o modelo de RandomForest rf = RandomForestRegressor(random_state=42, n_estimators=200, n_jobs=-1)",No,5,4.0 "# Preparando os dados para o kaggle # Criando as previses para os dados de teste preds_test = rf.predict(test[feats])'",No,5,48.0 "# Salvando o arquivo para o Kaggle test[['datetime', 'count']].to_csv('rf.csv', index=False)",No,5,25.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0]) test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "df.rename(columns={'count':'rentals'},inplace=True)",No,5,61.0 "df = df.append(test, sort=False)",No,5,11.0 "df['year'] = df.datetime.dt.year # df['datetime'].dt.year df['month'] = df.datetime.dt.month df['day'] = df.datetime.dt.day df['dayofweek'] = df.datetime.dt.dayofweek df['hour'] = df['datetime'].dt.hour",No,5,8.0 df.sort_index(inplace=True),No,5,9.0 "# Separando os dataframes test = df[df['rentals'].isnull()] df = df[~df['rentals'].isnull()]",No,5,13.0 "from sklearn.model_selection import train_test_split train, valid = train_test_split(df, random_state=42)",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=42, n_estimators=100,n_jobs=-1)",Yes,4,4.0 "rf.fit(train[feats],train['rentals'])",No,5,7.0 "# Fazer as previses preds = rf.predict(valid[feats])'",No,4,48.0 "from sklearn.metrics import mean_squared_error mean_squared_error(valid['rentals'],preds)**(1/2)",Yes,4,49.0 "b""# Adicionar as previses ao dataframe\ntest['count'] = np.exp(preds_test)-1\n""",No,4,8.0 "test[['datetime','count']]",No,5,41.0 "train, valid = df[df['day'] <= 15], df.query('day > 15')",No,5,14.0 "# Preparando os dados para o kaggle # Criando as previses para os dados de teste preds_test = 
rf.predict(test[feats])'",No,5,48.0 "# Salvando o arquivo pro kaggle test[['datetime','count']].to_csv('rf2.csv', index=False) ",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=42, n_estimators=100,n_jobs=-1,oob_score=True)",No,5,4.0 "rf.fit(df[feats],df['rentals']) rf.oob_score_",No,4,7.0 "# Fazer as previses preds = rf.oob_prediction_'",Yes,4,22.0 "from sklearn.metrics import mean_squared_error mean_squared_error(df['rentals'],preds)**(1/2)",Yes,4,7.0 "# Salvando o arquivo pro kaggle test[['datetime','count']].to_csv('rf3.csv', index=False) ",No,5,25.0 " def cv(df, test, feats, y_name, k=5): score, preds, fis = [], [], [] chunk = df.shape[0] // k for i in range(k): if i+1 < k: valid = df.iloc[i*chunk: (i+1)*chunk] train = df.iloc[:i*chunk].append(df.iloc[(i+1)*chunk:]) else: valid: df.iloc[i*chunk:] train: df.iloc[:i*chunk] rf = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=100) rf.fit(train[feats],train[y_name]) score.append(mean_squared_error(valid[y_name],rf.predict(valid[feats])) ** (1/2)) preds.append(rf.predict(test[feats])) fis.append(rf.feature_importances_) print(i, 'OK') return score, preds, fis",No,2,7.0 "score, preds, fis = cv(df, test, feats, 'rentals')",No,5,49.0 score,No,2,7.0 "pd.Series(score).mean() ",No,5,7.0 "test['count'] = np.exp(pd.DataFrame(preds).mean())-1 test[['datetime','count']].to_csv('rf4.csv', index=False) ",No,5,25.0 "import numpy as np import pandas as pd import pandas_profiling as pp import seaborn as sns import matplotlib.pyplot as plt import os import datetime",No,5,22.0 "import warnings warnings.filterwarnings('ignore')",No,5,23.0 "HOME_PATH = r'../input' os.listdir(HOME_PATH)",No,5,88.0 train_set = pd.read_csv(HOME_PATH+'/train.csv'),No,5,45.0 "def month_to_num(df): """""" Convert month to numerical """""" for i in range(1,10): df['month'].loc[df['month']=='0'+str(i)] = i for i in range(10,13): df['month'].loc[df['month']==str(i)] = i'",No,5,20.0 "def time_to_num(df): """""" Convert time to numerical """""" for i in range(0,10): df['time'].loc[df['time']=='0'+str(i)+':00:00'] = i for i in range(10,24): df['time'].loc[df['time']==str(i)+':00:00'] = i'",No,5,20.0 "def weekend(df): df['weekend'] = np.zeros_like(df['holiday']) df['weekend'].loc[(df['workingday'] == 0) & (df['holiday'] == 0)] = 1",No,5,8.0 "def weekday(df): df['weekday'] = df['datetime'].apply(lambda date : \\ datetime.datetime.strptime(str(date.split()[0]),""%Y-%m-%d"").weekday())'",No,5,8.0 "def process_df(df): split_datetime(df) round_temp(df) # month_to_num(df) time_to_num(df) weekend(df) weekday(df) return df.drop('datetime', axis=1)",Yes,4,8.0 "fig = plt.figure(figsize=(10, 4)) fig.add_subplot(1,2,1) sns.countplot(x='year', hue='count_bin', data=train_set.loc[train_set['year']==1]) fig.add_subplot(1,2,2) sns.countplot(x='year', hue='count_bin', data=train_set.loc[train_set['year']==2])",No,5,33.0 "fig = plt.figure(figsize=(20, 6)) fig.add_subplot(1,2,1) sns.countplot(x='weekday', hue='count_bin', data=train_set) fig.add_subplot(1,2,2) sns.countplot(x='time', hue='count_bin', data=train_set)",No,5,75.0 "sns.factorplot(x=""weekday"",y=""count"",data=train_set,kind='bar') sns.factorplot(x=""time"",y=""count"",data=train_set,kind='bar')'",No,5,75.0 "# fig = plt.figure(figsize=(10, 4)) # fig.add_subplot(1,2,1) sns.factorplot(x=""weekend"",y=""count"",data=train_set,kind='bar') # sns.countplot(x='weekend', hue='count_bin', data=train_set.loc[train_set['weekend']==1]) # fig.add_subplot(1,2,2) 
sns.factorplot(x=""workingday"",y=""count"",data=train_set,kind='bar') # sns.countplot(x='workingday', hue='count_bin', data=train_set.loc[train_set['workingday']==1])'",No,5,33.0 "sns.factorplot(x=""rounded_temp"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""rounded_atemp"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "fig = plt.figure() fig.add_subplot(2,2,1) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==1)]) fig.add_subplot(2,2,2) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==2)]) fig.add_subplot(2,2,3) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==3)]) fig.add_subplot(2,2,4) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==4)])",No,5,33.0 "sns.factorplot(x=""season"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""weather"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""temp_bin"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "from sklearn.metrics import mean_squared_log_error from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import KFold, cross_val_score, GridSearchCV import lightgbm as lgb from xgboost import XGBRegressor from sklearn.metrics import accuracy_score, make_scorer",No,5,22.0 "train_set = pd.get_dummies(train_set, columns=['season', 'weather', 'weekday', 'holiday']) # train_set = pd.get_dummies(train_set, columns=['season', 'weather', 'weekday', 'year', 'month', 'time'])",No,5,20.0 "train_set['temp_weather_1'] = train_set['temp'] * train_set['weather_1'] train_set['temp_weather_2'] = train_set['temp'] * train_set['weather_2'] train_set['temp_weather_3'] = train_set['temp'] * train_set['weather_3'] train_set['temp_weather_4'] = train_set['temp'] * train_set['weather_4']",No,5,8.0 "y = train_set.loc[:, 'count'] X = train_set.drop(['count', 'count_bin', 'casual', 'registered'], axis=1) ",No,5,21.0 "rf = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200, oob_score=True, min_samples_split=4, max_features=0.9, max_depth=17) rf.fit(X, y)",No,5,7.0 "scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_squared_log_error') print(""score, root score: "", scores, np.sqrt(np.abs(scores)))'",No,5,28.0 "d_importance = pd.DataFrame(columns=['features'], data=X.columns) d_importance['importance'] = rf.feature_importances_ d_importance.sort_values(by='importance',ascending=False).head(20)",No,5,79.0 "test_set = pd.read_csv(HOME_PATH+'/test.csv') y_test = test_set['datetime'] test_set = process_df(test_set) test_set = pd.get_dummies(test_set, columns=['season', 'weather', 'weekday', 'holiday'])",Yes,2,45.0 "test_set['temp_weather_1'] = test_set['temp'] * test_set['weather_1'] test_set['temp_weather_2'] = test_set['temp'] * test_set['weather_2'] test_set['temp_weather_3'] = test_set['temp'] * test_set['weather_3'] test_set['temp_weather_4'] = test_set['temp'] * test_set['weather_4'] test_set['temp_bin'] = np.floor(test_set['temp'])//5 # test_set['high_time'] = np.zeros_like(test_set['time']) # test_set['high_time'].loc[(((test_set['time'] > 6) & (test_set['time'] < 15)) | (test_set['time'] == 20))] = 1 # test_set['high_time'].loc[((test_set['time'] == 8) | (test_set['time'] == 16) | (test_set['time'] == 19))] = 2 # test_set['high_time'].loc[((test_set['time'] == 17) | (test_set['time'] == 18))] = 3",No,5,8.0 test_set.head(),No,5,41.0 "predictions = np.zeros_like(y_test) predictions = (rf.predict(test_set)).round().astype(int) 
predictions[predictions < 0] = 0 submission = pd.concat([y_test, pd.Series(predictions, name=""count"")], axis=1) print(submission.head(30)) submission.to_csv(""submission.csv"", index=False)",Yes,3,25.0 "import numpy as np import pandas as pd train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""]) train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek y_casual = np.log1p(train.casual) y_registered = np.log1p(train.registered) #y_train = np.log1p(train[""count""]) train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed"", ], 1, inplace=True) import lightgbm as lgb hyperparameters = { 'colsample_bytree': 0.725, 'learning_rate': 0.013, 'num_leaves': 56, 'reg_alpha': 0.754, 'reg_lambda': 0.071, 'subsample': 0.523, 'n_estimators': 1093} model = lgb.LGBMRegressor(**hyperparameters) model.fit(train, y_casual) preds1 = model.predict(test) hyperparameters = { 'colsample_bytree': 0.639, 'learning_rate': 0.011, 'num_leaves': 30, 'reg_alpha': 0.351, 'reg_lambda': 0.587, 'subsample': 0.916, 'n_estimators': 2166} model = lgb.LGBMRegressor(**hyperparameters, ) model.fit(train, y_registered) preds2 = model.predict(test) submission=pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = np.expm1(preds1) + np.expm1(preds2) #submission.to_csv(""allrf.csv"", index=False)'",Yes,1,22.0 "pd.options.display.max_rows = 200 submission[""holiday""] = test[""holiday""] submission.loc[(submission[""holiday""]==1)]",No,4,55.0 "# Filter cover type and then barplot of wilderness area to see if any trees grow exclusively in a region. #data.describe() data = dtrain.groupby(['Cover_Type'])[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].sum() # Transpose to get numbers by wilderness type. data.T.plot(kind = 'bar', figsize = (12,8)) plt.show()",Yes,3,8.0 "# Drop Soil type 15,7 - They have no variation. 
dtrain.drop(['Soil_Type7', 'Soil_Type15'], axis = 1, inplace = True) # filtering all columns that contain the str Soil soil_columns = dtrain.columns[dtrain.columns.str.contains('Soil')].to_list()",Yes,3,10.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[:10]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[10:20]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[20:30]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[30:]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "label = dtrain['Cover_Type'] dtrain.drop(['Cover_Type'], axis = 1, inplace=True)",No,5,10.0 "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from xgboost import XGBClassifier",No,5,22.0 "x_train, x_test, y_train, y_test = train_test_split(dtrain, label, test_size = .3) dirty_clf = RandomForestClassifier() dirty_clf.fit(x_train, y_train) print(dirty_clf.score(x_test, y_test)) imp_feat = pd.DataFrame(index= dtrain.columns.to_list() , data= dirty_clf.feature_importances_) imp_feat.rename(columns={0 : 'Importance'}, inplace=True) imp_feat.sort_values(by='Importance', axis =0, ascending=False)[:15]",Yes,2,4.0 "baseline_features = ['Elevation', 'Horizontal_Distance_To_Roadways'] features = ['Elevation', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points', 'Aspect','Wilderness_Area1', 'Wilderness_Area4', 'Soil_Type3', 'Soil_Type4','Soil_Type10', 'Soil_Type29', 'Soil_Type38'] x_train, x_test, y_train, y_test = train_test_split(dtrain[features], label, test_size = .3)",Yes,3,21.0 "clf = DecisionTreeClassifier(criterion='gini', max_depth=8, min_samples_split=2, class_weight= None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None,)",No,5,4.0 "grid_params = {'criterion' : [""gini"", ""entropy""]} grid = GridSearchCV(estimator=clf, param_grid=grid_params, n_jobs=-1, cv = 5) grid.fit(x_train, y_train) grid.score(x_test, y_test)'",Yes,2,4.0 "grid.best_estimator_ y_pred = grid.predict(x_test)",No,5,48.0 "clf.fit(x_train, y_train) print(f'No of Leaves : {clf.get_n_leaves()}') clf.feature_importances_",Yes,2,7.0 "rnd_clf = RandomForestClassifier() grid_params_1 = {'max_depth' : [18], 'n_estimators' : [127], 'criterion':['entropy']} grid = GridSearchCV(estimator=rnd_clf, param_grid=grid_params_1, n_jobs=-1, cv = 5) grid.fit(x_train, y_train)",Yes,4,4.0 "final_clf = RandomForestClassifier(max_depth=18, n_estimators=127, criterion='entropy') final_clf.fit(x_train, y_train) print(final_clf.score(x_train, y_train)) print(final_clf.score(x_test, y_test)) y_hat = final_clf.predict(x_test)",Yes,2,4.0 "plt.figure(figsize=(8,8)) sns.heatmap(pd.DataFrame(confusion_matrix(y_test, y_pred), index = label_dict.values(), columns= label_dict.values()), annot=True, cbar = False) plt.show()",No,5,80.0 "imp_feat = pd.DataFrame(index= features , data= final_clf.feature_importances_) imp_feat.rename(columns={0 : 'Importance'}, inplace=True) 
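# Optional alternative (not the author's code): the same sorted importance
# table can be produced in one step with a Series, e.g.
#     pd.Series(final_clf.feature_importances_, index=features).sort_values(ascending=False)
# The DataFrame / rename / sort_values version used in this cell is kept as written.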
imp_feat.sort_values(by='Importance', axis =0, ascending=False)",Yes,2,12.0 "xgb_clf = XGBClassifier(n_estimators=100, max_depth = 12)",No,5,4.0 "xgb_clf.fit(x_train, y_train) xgb_clf.score(x_test, y_test)",Yes,3,7.0 y_pred = xgb_clf.predict(x_test),No,5,48.0 "# Final Fit xgb_clf.fit(dtrain[features], label)",No,5,7.0 y_test_hat = xgb_clf.predict(dtest[features]),No,5,48.0 sns.distplot(dtest.Elevation),No,5,33.0 "df_submit = pd.read_csv(submit_path, index_col=0) df_submit['Cover_Type'] =y_test_hat df_submit.to_csv('submit_kaggle.csv')",Yes,4,25.0 " import os #donnes print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output. import pandas as pd import numpy as np #random forest pour regressor from sklearn.ensemble import RandomForestRegressor #Pour le split feature et target from sklearn.model_selection import train_test_split #Pour regression au lieu de accuracy c'est mean_square_error from sklearn.metrics import mean_squared_error #Ne pas afficher le warning lors du fit par exemple #Import pour la cross_validation from sklearn.model_selection import cross_val_score #import random forest pour regression from sklearn.ensemble import RandomForestRegressor import warnings warnings.filterwarnings('ignore')'",Yes,4,22.0 "# Fichier de train X_train = pd.read_csv(""../input/train.csv"") #Fichier de test X_test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "submission = pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = 195",Yes,3,25.0 "#Convertir notre fichier en csv submission.to_csv('submission.csv', index=False)",No,5,25.0 X_test.columns,No,5,71.0 "## Definition foncyion features et target def split_dataset(df, features, target='count'): X = df[features] y = df[target] return X, y ",No,5,21.0 "## Fonction split date def date_split(df_train, df_test, date='datetime'): ##Traitement_df_train cols=df_train[date] date_cols=pd.to_datetime(cols) df_train['year'] = date_cols.dt.year df_train['month'] = date_cols.dt.month df_train['day'] = date_cols.dt.day df_train['hour'] = date_cols.dt.hour df_train['minute'] = date_cols.dt.minute df_train['second'] = date_cols.dt.second df_train = df_train.drop(['datetime'], axis=1) ##Traitement_df_test cols2=df_test[date] date_cols2=pd.to_datetime(cols2) df_test['year'] = date_cols2.dt.year df_test['month'] = date_cols2.dt.month df_test['day'] = date_cols2.dt.day df_test['hour'] = date_cols2.dt.hour df_test['minute'] = date_cols2.dt.minute df_test['second'] = date_cols2.dt.second df_test = df_test.drop(['datetime'], axis=1) return df_train, df_test ",Yes,4,21.0 "#Definition de X_train et X_test avec les memes columns X_train, X_test = date_split(X_train, X_test)",No,5,13.0 "#Appel de la fonction pour avoir le meme nombre de columns: X_trainGet_cols = Get_cols(X_test) numbers = X_trainGet_cols.select_dtypes(np.number) numbers.head()",Yes,2,41.0 "##Definition features and target X_train_features, y_train_target = split_dataset(X_train, features=numbers.columns) X_train_features, y_train_target",No,5,13.0 "##################Cross Validation ## random forest regressor #Import Random Forest pour regressor from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor() ##Cross Validation score = -cross_val_score(rf, X_train_features, y_train_target, cv=5, scoring='neg_mean_squared_error')",No,3,22.0 "############### FIT entrainer tout le set d'entrainement rf.fit(X_train_features, y_train_target)",No,5,7.0 "## Predict sur le train y_train_pred = rf.predict(X_train_features)",No,5,27.0 
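# Standalone sketch (not one of the original cells): the cross-validation above
# stores per-fold mean squared errors (the sign is already flipped back by the
# leading minus), so a square root turns them into the more readable RMSE.
# Dummy values stand in for the real `score` array computed earlier.
import numpy as np
_mse_per_fold_demo = np.array([9.0, 16.0, 25.0])   # pretend output of -cross_val_score(...)
_rmse_per_fold_demo = np.sqrt(_mse_per_fold_demo)  # -> array([3., 4., 5.])
print(_rmse_per_fold_demo.mean())                  # average RMSE across folds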
"###### Predict sur le test ## Voir si on a le meme nombre de columns dans test et dans train y_test_pred = rf.predict(X_test) ",No,5,48.0 "mean_train = mean_squared_error( y_train_target, y_train_pred) #mean_test = mean_squared_error(y_test, y_test_pred) mean_train #, mean_test",No,5,28.0 "submission = pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = y_test_pred #Convertir notre fichier en csv submission.to_csv('submission.csv', index=False)'",Yes,3,45.0 submission.head(3),No,5,41.0 "import numpy as np import pandas as pd import seaborn as sns from scipy import stats import calendar from datetime import datetime import matplotlib.pyplot as plt %matplotlib inline ",Yes,4,22.0 "fig,axes = plt.subplots(2,2) fig.set_size_inches(12,10) sns.distplot(train['temp'],ax=axes[0,0]) sns.distplot(train['atemp'],ax=axes[0,1]) sns.distplot(train['humidity'],ax=axes[1,0]) sns.distplot(train['windspeed'],ax=axes[1,1]) axes[0,0].set(xlabel='temp',title='Distribtion of temp') axes[0,1].set(xlabel='atemp',title='Distribtion of atemp') axes[1,0].set(xlabel='humidity',title='Distribtion of humidity') axes[1,1].set(xlabel='windspeed',title='Distribtion of windspeed') ",Yes,3,33.0 "train['datetime'] = pd.to_datetime(train['datetime'],errors='coerce') train['date'] = train['datetime'].apply(lambda x: x.date()) train['year'] = train['datetime'].apply(lambda x: x.year) train['month'] = train['datetime'].apply(lambda x: x.month) train['weekday'] = train['datetime'].apply(lambda x: x.weekday()) train['hour'] = train['datetime'].apply(lambda x: x.hour).astype('int')",Yes,2,16.0 "dummies_month = pd.get_dummies(train['month'], prefix= 'month') dummies_season = pd.get_dummies(train['season'], prefix= 'season') dummies_weather = pd.get_dummies(train['weather'], prefix= 'weather') dummies_year = pd.get_dummies(train['year'], prefix= 'year') data=pd.concat([train,dummies_month,dummies_season,dummies_weather,dummies_year],axis=1) yLabels=data['count'] dropFeatures = ['casual' , 'count' , 'datetime' , 'registered' , 'date' ,'season', 'weather','month','year'] dataTrain = data.drop(dropFeatures,axis=1)",Yes,2,20.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_log_error from math import sqrt import statsmodels.api as sm",No,5,22.0 "X_train_ca, X_valid_ca, y_train_ca, y_valid_ca = train_test_split(dataTrain, train['casual'], test_size=0.3, random_state=42) X_train_re, X_valid_re, y_train_re, y_valid_re = train_test_split(dataTrain, train['registered'], test_size=0.3, random_state=42) X_train, X_valid, y_train, y_valid = train_test_split(dataTrain, train['count'], test_size=0.3, random_state=42)",No,5,13.0 "rfModel_ca = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel_ca.fit(X_train_ca , y_train_ca) rfModel_re = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel_re.fit(X_train_re , y_train_re) rfModel = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel.fit(X_train , y_train)",Yes,2,4.0 "preds_train_ca = rfModel_ca.predict( X = X_train_ca) rmsle_casual_train = sqrt(mean_squared_log_error(y_train_ca, preds_train_ca)) preds_valid_ca = rfModel_ca.predict( X = X_valid_ca) rmsle_casual_valid = sqrt(mean_squared_log_error(y_valid_ca, preds_valid_ca)) print('Casual train rmsle : %.5f, valid rmsle : %.5f' %(rmsle_casual_train, rmsle_casual_valid)) preds_train_re = rfModel_re.predict( X = X_train_re) 
rmsle_registered_train = sqrt(mean_squared_log_error(y_train_re, preds_train_re)) preds_valid_re = rfModel_re.predict( X = X_valid_re) rmsle_registered_valid = sqrt(mean_squared_log_error(y_valid_re, preds_valid_re)) print('Registered train rmsle : %.5f,valid rmsle : %.5f' %(rmsle_registered_train, rmsle_registered_valid))",Yes,3,27.0 "preds_train = rfModel.predict( X = X_train) rmsle_count_train = sqrt(mean_squared_log_error(y_train, preds_train)) preds_valid = rfModel.predict( X = X_valid) rmsle_count_valid = sqrt(mean_squared_log_error(y_valid, preds_valid)) print('Count train rmsle : %.5f, valid rmsle : %.5f' %(rmsle_count_train, rmsle_count_valid))",Yes,3,27.0 "preds_train_merge_count = preds_train_re + preds_train_ca preds_valid_merge_count = preds_valid_re + preds_valid_ca rmsle_merge_train = sqrt(mean_squared_log_error(y_train,preds_train_merge_count)) rmsle_merge_valid = sqrt(mean_squared_log_error(y_valid,preds_valid_merge_count)) print('(merge) Count train rmsle : %.5f, valid rmsle : %.5f'%(rmsle_merge_train, rmsle_merge_valid))",Yes,2,11.0 "test['datetime'] = pd.to_datetime(test['datetime'],errors='coerce') test['date'] = test['datetime'].apply(lambda x: x.date()) test['year'] = test['datetime'].apply(lambda x: x.year) test['month'] = test['datetime'].apply(lambda x: x.month) test['weekday'] = test['datetime'].apply(lambda x: x.weekday()) test['hour'] = test['datetime'].apply(lambda x: x.hour).astype('int')",Yes,2,16.0 "dummies_month = pd.get_dummies(test['month'], prefix= 'month') dummies_season = pd.get_dummies(test['season'], prefix= 'season') dummies_weather = pd.get_dummies(test['weather'], prefix= 'weather') dummies_year = pd.get_dummies(test['year'], prefix= 'year') data_test=pd.concat([test,dummies_month,dummies_season,dummies_weather,dummies_year],axis=1) datetimecol = test['datetime'] yLabels=data['count'] dropFeatures = ['datetime' , 'date' ,'season', 'weather','month','year'] dataTest = data_test.drop(dropFeatures,axis=1)",Yes,2,20.0 "predsTest= rfModel_re.predict(X = dataTest) + rfModel_ca.predict(X = dataTest) submission=pd.DataFrame({'datetime':datetimecol , 'count':[max(0,x) for x in predsTest]})",Yes,2,48.0 "submission.to_csv('sampleSubmission.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings('ignore', category=RuntimeWarning) import os print(os.listdir(""../input""))'",Yes,4,22.0 "bike_df = pd.read_csv('../input/train.csv') bike_df.shape",Yes,2,45.0 bike_df.info(),No,5,40.0 bike_df.head(),No,5,41.0 "bike_df.drop(['datetime', 'casual', 'registered'], axis=1, inplace=True)",No,5,10.0 y_target.hist(),No,5,33.0 "y_log_transform = np.log1p(y_target) y_log_transform.hist()",Yes,3,33.0 "coef = pd.Series(lr_reg.coef_, index=X_features.columns) coef_sort = coef.sort_values(ascending=False) sns.barplot(x=coef_sort.values, y=coef_sort.index)",Yes,3,9.0 "X_features_ohe = pd.get_dummies(X_features, columns=['year', 'month', 'hour', 'holiday', 'workingday', 'season', 'weather'])",No,5,20.0 X_features_ohe.head(),No,5,41.0 "coef = pd.Series(lr_reg.coef_, index=X_features_ohe.columns) coef_sort = coef.sort_values(ascending=False)[:15] sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log, test_size=0.3, random_state=2019) from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import 
LGBMRegressor rf_reg = RandomForestRegressor(n_estimators=500) gbm_reg = GradientBoostingRegressor(n_estimators=500) xgb_reg = XGBRegressor(n_estimators=500) lgbm_reg = LGBMRegressor(n_estimators=500) for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]: get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=True)",No,3,4.0 submission = pd.read_csv('../input/sampleSubmission.csv'),No,5,45.0 "X_test = pd.read_csv('../input/test.csv') X_test.head()",Yes,3,45.0 "X_test.drop(['datetime'], axis=1, inplace=True) X_test.head()",Yes,2,10.0 "X_test_ohe = pd.get_dummies(X_test, columns=['year', 'month', 'hour', 'holiday', 'workingday', 'season', 'weather']) X_test_ohe.head()",Yes,2,20.0 prediction = lgbm_reg.predict(X_test_ohe),No,5,48.0 "submission.to_csv('./My_submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import os import datetime import calendar",Yes,4,22.0 "from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.metrics import mean_squared_error ",No,5,22.0 "from xgboost import XGBRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor",No,5,22.0 "import warnings warnings.filterwarnings(""ignore"")",Yes,4,22.0 "train_data = pd.read_csv(path_in + 'train.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True) test_data = pd.read_csv(path_in + 'test.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True) samp_subm = pd.read_csv(path_in+'sampleSubmission.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True)",No,5,45.0 "# Parameters num_months_per_year = 12 year_list = [2011, 2012]",No,5,77.0 "month = 5 year = 2011 start_date = datetime.datetime(year, month, 1, 0, 0, 0) end_date = datetime.datetime(year, month, 19, 23, 0, 0) # train_data['count_log'] = np.log1p(train_data['count']) # train_data['rolling_mean'] = train_data['count'].rolling(window = 24).mean() # train_data['rolling_std'] = train_data['count'].rolling(window = 24).std() ",Yes,2,77.0 "train_data = pd.get_dummies(train_data) test_data = pd.get_dummies(test_data)",No,5,20.0 "scaler = MinMaxScaler() train_data[scale_features] = scaler.fit_transform(train_data[scale_features]) test_data[scale_features] = scaler.transform(test_data[scale_features])",Yes,2,4.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",Yes,5,88.0 "# Additional Libraries import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from xgboost.sklearn import XGBRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import accuracy_score from keras.callbacks import ModelCheckpoint from keras.models import Sequential from keras.layers import Dense, Activation, Flatten from keras.layers import Dropout import lightgbm as lgb from sklearn.metrics import mean_squared_error from math import sqrt",No,5,22.0 "b""def define_data(data , info=True ,shape = True, percentage =True,describe = True , sample=True , columns = False):\n if columns == True:\n print('\\nColumns of Data...')\n print(data.columns)\n return \n if shape ==True:\n print('Shape of Data is...')\n print(data.shape)\n if info==True:\n print('\\nInfo of Data...')\n print(data.info())\n if percentage ==True:\n print('\\nPercentage of Data Missing ...')\n print((data.isnull().sum()/data.shape[0])*100)\n if describe == True:\n print('\\nDescription of data...')\n display(data.describe())\n if sample == True:\n print('\\nSample of Data...')\n display(data.sample(10).T)\n \n\ndefine_data(train)""",No,4,40.0 "define_data(train , columns = True) define_data(test , columns = True)",No,4,40.0 "# Divide DateTime Column to various Columns def add_dates(data , column , suffix='time_' , year = True , month = True , day = False ,dayofweek = True, hour = True , minute = False , second = False , date = False , time = False): data['add_date_date_time'] = pd.to_datetime(data[column]) if year == True: data[suffix+'year']=data['add_date_date_time'].dt.year if month == True: data[suffix+'month']=data['add_date_date_time'].dt.month if day == True: data[suffix+'day']=data['add_date_date_time'].dt.day if hour == True: data[suffix+'hour']=data['add_date_date_time'].dt.hour if minute == True: data[suffix+'minute']=data['add_date_date_time'].dt.minute if date == True: data[suffix+'date']=data['add_date_date_time'].dt.date if time == True: data[suffix+'time']=data['add_date_date_time'].dt.time if second == True: data[suffix+'second']=data['add_date_date_time'].dt.second if dayofweek == True: data[suffix+'dayofweek']=data['add_date_date_time'].dt.dayofweek data = data.drop(columns = ['add_date_date_time'] , axis =1) return data train = add_dates(train , column = 'datetime') define_data(train , columns = True)",No,5,16.0 "b""def unique_count(data , columns = []):\n for col in columns :\n print('Unique Data Percentage in ',col)\n print((data[col].value_counts()/data.shape[0])*100)\n print('\\n')\nunique_count(train , columns = ['season','weather','time_year', 'time_dayofweek'])""",No,4,54.0 "b""def display_unique_data(data):\n for i in data.columns:\n unique_cols_data = data[i].unique()\n if len(unique_cols_data)<20:\n print('Correct Type on Column -> ',i)\n print('Unique data in this Column is -> ',unique_cols_data)\n print('\\n')\ndisplay_unique_data(train)""",No,5,57.0 "display(train.corr().style.format(""{:.2%}"").highlight_min()) # f,ax = plt.subplots(figsize=(15, 15)) # sns.heatmap(train.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)'",No,4,80.0 "def joint_plots(data , col,columns = []): plt.figure(figsize=(16,16)) for i in columns: sns.jointplot( 
x=col , y=i , data=data , height=10, ratio=3 , color='g') plt.show() joint_plots(train , columns = ['temp' , 'atemp' ,'humidity' , 'windspeed' ] , col = 'count')",No,5,33.0 "def plot_bar(data, col , feature=[]): length = len(feature)*4 plt.figure(figsize=(20,length)) for i,j in zip(feature,range(1,len(feature)*2-1,2)): plt.subplot(10,2,j) #fig = plt.figure(figsize=(9,8)) sns.barplot(x=i, y=col, data=data, palette='Set2',orient='v') plt.plot() plt.subplot(10,2,j+1) sns.boxplot(x=i, y=col, data=data, palette='Set2' , width=.4) plt.plot()",No,5,33.0 "plot_bar(train, col = 'count',feature =['time_hour','time_month','time_dayofweek','time_year','weather', 'holiday' , 'workingday' , 'season' ])",No,5,33.0 "b""def new_col_categorical(data , columns = [] , remove_original = True):\n for i in columns:\n unique_cols = data[i].unique()\n if len(unique_cols) < 20:\n print('\\nCorrect Type on Column -> ',i)\n print('Unique data in this Column is -> ',unique_cols)\n else:\n return data\n if remove_original == False:\n original_data = data[columns]\n data = pd.get_dummies(data , columns = columns)\n if remove_original == False:\n data = pd.concat([data,original_data] , axis=1)\n return data\n """,No,3,20.0 "train = new_col_categorical(train,columns=['season','weather','time_year', 'time_dayofweek' , 'time_month','time_hour_group'] , remove_original = False)",No,5,8.0 "define_data(train, columns = True ) # train_x_new = train.drop(columns =['datetime','count', 'season_1','casual','registered', # 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', # 'weather_3', 'weather_4', 'time_year_2011', 'time_year_2012', # 'time_dayofweek_0', 'time_dayofweek_1', 'time_dayofweek_2', # 'time_dayofweek_3', 'time_dayofweek_4', 'time_dayofweek_5', # 'time_dayofweek_6', 'time_month_1', 'time_month_2', 'time_month_3', # 'time_month_4', 'time_month_5', 'time_month_6', 'time_month_7', # 'time_month_8', 'time_month_9', 'time_month_10', 'time_month_11', # 'time_month_12'] , axis = 1) train_x_new = train.drop(columns =['datetime','count', 'casual','registered', 'season', 'weather', 'time_year', 'time_dayofweek', 'time_month','time_hour_group'] , axis = 1) train_y_new = train['count'] define_data(train_x_new, columns = True )",Yes,5,21.0 "# Processing Test Data test = add_dates(test , column = 'datetime') test['time_hour_group'] = test['time_hour'].apply(hour_group).astype(str) test = new_col_categorical(test,columns=['season','weather','time_year', 'time_dayofweek' , 'time_month','time_hour_group'] , remove_original = False) test['weekend'] = test['time_dayofweek_5']+test['time_dayofweek_6'] # test_x_new = test.drop(columns =['datetime', 'season_1', # 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', # 'weather_3', 'weather_4', 'time_year_2011', 'time_year_2012', # 'time_dayofweek_0', 'time_dayofweek_1', 'time_dayofweek_2', # 'time_dayofweek_3', 'time_dayofweek_4', 'time_dayofweek_5', # 'time_dayofweek_6', 'time_month_1', 'time_month_2', 'time_month_3', # 'time_month_4', 'time_month_5', 'time_month_6', 'time_month_7', # 'time_month_8', 'time_month_9', 'time_month_10', 'time_month_11', # 'time_month_12'] , axis = 1) test_x_new = test.drop(columns =['datetime', 'season', 'weather', 'time_year', 'time_dayofweek', 'time_month','time_hour_group'] , axis = 1)",No,3,8.0 "print('For Train Data .. ') define_data(train_x_new, columns = True ) print('For Test Data .. 
') define_data(test_x_new , columns = True )",No,5,40.0 "scaler = MinMaxScaler() train_x_new = scaler.fit_transform(train_x_new) train_y_new = np.log1p(train_y_new)",Yes,5,18.0 "test_x_new_1 = scaler.transform(test_x_new) test_x_new_2 = scaler.fit_transform(test_x_new)",No,5,18.0 "X_train , X_test , Y_train , Y_test = train_test_split(train_x_new , train_y_new , test_size = .15 , random_state = 65 )",No,5,13.0 "valid_0_error =0 valid_1_error =0",No,5,77.0 "%%time params = { ""objective"" : ""regression"", ""metric"" : ""mae"", ""num_leaves"" : 60, ""learning_rate"" : 0.01, ""bagging_fraction"" : 0.9, ""bagging_seed"" : 0, ""num_threads"" : 4, ""colsample_bytree"" : 0.5, 'lambda_l2':9 } model = lgb.train( params, train_set = train_set, num_boost_round=10000, early_stopping_rounds=200, verbose_eval=100, valid_sets=[train_set,val_set] ) '",No,4,7.0 "%%time lgb_pred_test = model.predict(X_test, num_iteration=model.best_iteration) lgb_pred_train = model.predict(X_train, num_iteration=model.best_iteration) lgb_pred_normal = model.predict(test_x_new_1, num_iteration=model.best_iteration) lgb_pred_fit = model.predict(test_x_new_2, num_iteration=model.best_iteration)",Yes,5,48.0 "# print(lgb_pred) # print(np.array(Y_test)) valid_0_error_new = sqrt(mean_squared_error(np.array(Y_train),lgb_pred_train)) valid_1_error_new = sqrt(mean_squared_error(np.array(Y_test),lgb_pred_test)) score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new",No,3,49.0 lgb.plot_importance(model),No,5,79.0 "%%time n_estimators=100 xgb = XGBRegressor(n_estimators=n_estimators,max_depth=4,learning_rate =0.01 , booster = 'gbtree') xgb.fit(X_train ,Y_train ,eval_set=[(X_train, Y_train), (X_test, Y_test)] , verbose = False) score = xgb.evals_result() valid_0_error_new = np.amin(score['validation_0']['rmse']) valid_1_error_new = np.amin(score['validation_1']['rmse']) score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new",Yes,3,7.0 "%%time model = RandomForestRegressor(random_state=65, n_estimators=200, min_samples_split=4) result = model.fit(X_train, Y_train)",Yes,5,7.0 "model.score(X_test, Y_test)",No,5,49.0 "%%time n_estimators=3000 xgb = XGBRegressor(n_estimators=n_estimators,max_depth=4,learning_rate =0.01 , booster = 'gbtree') xgb.fit(train_x_new , train_y_new ,eval_set=[(X_train, Y_train), (X_test, Y_test)] , verbose = False) pred_normal = xgb.predict(test_x_new_1) pred_fit = xgb.predict(test_x_new_2)",Yes,4,7.0 "%%time model = RandomForestRegressor(random_state=65, n_estimators=n_estimators-2000) model.fit(train_x_new , train_y_new) rfr_pred_normal = model.predict(test_x_new_1) rfr_pred_fit = model.predict(test_x_new_2)",Yes,3,7.0 "%%time NN_model = Sequential() # The Input Layer : NN_model.add(Dense(128, kernel_initializer='normal',input_dim = train_x_new.shape[1], activation='relu')) # The Hidden Layers : NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) # The Output Layer : NN_model.add(Dense(1, kernel_initializer='normal',activation='linear')) # Compile the network : NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error']) NN_model.summary() history = 
NN_model.fit(train_x_new,train_y_new, epochs=50, batch_size=64, verbose=1, validation_split=0.2) start_point = 150-100 r = range(start_point,150) plt.figure(figsize=(16,8)) plt.plot( history.history['loss'] ,'r' ,label ='Train') plt.plot( history.history['val_loss'] , 'g' , label = 'Test' ) plt.legend(fontsize='x-large') valid_0_error_new = history.history['loss'][-1] valid_1_error_new = history.history['val_loss'][-1] score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new ",Yes,5,84.0 "ANN_pred_normal = NN_model.predict(test_x_new_1) ANN_pred_fit = NN_model.predict(test_x_new_2) ANN_pred_normal = np.expm1(ANN_pred_normal) ANN_pred_fit = np.expm1(ANN_pred_fit) ANN_pred_fit = ANN_pred_fit.reshape(6493) ANN_pred_normal = ANN_pred_normal.reshape(6493)",No,4,48.0 "output = pd.DataFrame({'datetime': test.datetime,'count': pred_normal}) output.to_csv('xgb_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': pred_fit}) output.to_csv('xgb_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': rfr_pred_normal}) output.to_csv('rfr_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': rfr_pred_fit}) output.to_csv('rfr_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': ANN_pred_normal}) output.to_csv('ANN_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': ANN_pred_fit}) output.to_csv('ANN_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': lgb_pred_normal}) output.to_csv('lgb_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': lgb_pred_fit}) output.to_csv('lgb_pred_fit.csv', index=False)",No,5,25.0 "train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""]) train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek",No,3,45.0 "train.info() # train.shape, train.isnull().sum(), train.dtypes '",No,5,40.0 "train.describe() # , , '",No,5,40.0 "b""train['temp'].value_counts().sort_index() # (binning)""",No,5,72.0 "import seaborn as sns import matplotlib.pylab as plt _, axes = plt.subplots(1,1, figsize = (20,12)) sns.boxplot(x=train[""hour""], y=train[""count""])",Yes,5,75.0 "fig, axes = plt.subplots(3,1, figsize = (20,12)) sns.countplot(train[""season""], ax = axes[0], palette=""Set1"") sns.countplot(train[""weather""], ax = axes[1], palette=""Set1"") sns.countplot(train[""windspeed""], ax = axes[2])",No,5,33.0 "fig, axes = plt.subplots(3,1, figsize = (20,12)) sns.countplot(train[""season""], ax = axes[0], palette=""Set1"") sns.countplot(train[""weather""], ax = axes[1], palette=""Set1"") sns.countplot(train[""windspeed""], ax = axes[2]) plt.xticks(rotation = 60, )",No,5,33.0 "y_casual = np.log1p(train.casual) y_registered = np.log1p(train.registered) #y_train = np.log1p(train[""count""]) train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed"", ], 1, inplace=True)",Yes,5,21.0 "import lightgbm as lgb hyperparameters = { 'colsample_bytree': 0.725, 'learning_rate': 0.013, 
'num_leaves': 56, 'reg_alpha': 0.754, 'reg_lambda': 0.071, 'subsample': 0.523, 'n_estimators': 1093} model = lgb.LGBMRegressor(**hyperparameters) model.fit(train, y_casual) preds1 = model.predict(test) hyperparameters = { 'colsample_bytree': 0.639, 'learning_rate': 0.011, 'num_leaves': 30, 'reg_alpha': 0.351, 'reg_lambda': 0.587, 'subsample': 0.916, 'n_estimators': 2166} model = lgb.LGBMRegressor(**hyperparameters, ) model.fit(train, y_registered) preds2 = model.predict(test) submission=pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = np.expm1(preds1) + np.expm1(preds2) #submission.to_csv(""allrf.csv"", index=False)'",Yes,3,7.0 submission.iloc[6332:6354],No,5,14.0 "submission.to_csv(""lgb.csv"", index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns import os from scipy import stats import missingno as msno plt.style.use('seaborn') import warnings warnings.filterwarnings(""ignore"") mpl.rcParams['axes.unicode_minus'] = False %matplotlib inline # import . # mlp.rcParams['axes.unicode_minus'] = False .",Yes,5,23.0 os.listdir('../input/'),No,4,88.0 "df_train = pd.read_csv(""../input/bike-sharing-demand/train.csv"", parse_dates = [""datetime""]) df_test = pd.read_csv(""../input/bike-sharing-demand/test.csv"", parse_dates = [""datetime""])",No,5,45.0 "b""for col in df_train.columns:\n msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_train[col].isnull().sum() / df_train[col].shape[0]))\n print(msperc)""",No,5,39.0 "b""for col in df_test.columns:\n msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_test[col].isnull().sum() / df_test[col].shape[0]))\n print(msperc)""",No,5,39.0 "msno.matrix(df_train, figsize=(12,5))",No,5,34.0 "df_train[""year""] = df_train[""datetime""].dt.year df_train[""month""] = df_train[""datetime""].dt.month df_train[""day""] = df_train[""datetime""].dt.day df_train[""hour""] = df_train[""datetime""].dt.hour df_train[""minute""] = df_train[""datetime""].dt.minute df_train[""second""] = df_train[""datetime""].dt.second df_test[""year""] = df_test[""datetime""].dt.year df_test[""month""] = df_test[""datetime""].dt.month df_test[""day""] = df_test[""datetime""].dt.day df_test[""hour""] = df_test[""datetime""].dt.hour df_test[""minute""] = df_test[""datetime""].dt.minute df_test[""second""] = df_test[""datetime""].dt.second df_train.shape # datetime ,,,,, . # column 18 .",Yes,4,8.0 "figure, ((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(nrows = 2, ncols = 3) figure.set_size_inches(18,10) sns.barplot(data=df_train, x = ""year"", y = ""count"", ax = ax1) sns.barplot(data=df_train, x = ""month"", y = ""count"", ax = ax2) sns.barplot(data=df_train, x = ""day"", y = ""count"", ax = ax3) sns.barplot(data=df_train, x = ""hour"", y = ""count"", ax = ax4) sns.barplot(data=df_train, x = ""minute"", y = ""count"", ax = ax5) sns.barplot(data=df_train, x = ""second"", y = ""count"", ax = ax6) ax1.set(ylabel = ""count"", title = ""Rental amount by year"") ax2.set(ylabel = ""count"", title = ""Rental amount by month"") ax3.set(ylabel = ""count"", title = ""Rental amount by day"") ax4.set(ylabel = ""count"", title = ""Rental amount by hour"") # barplot . # 11 12 . # . # 1~19 . test . # 8 5, 6 , .",No,5,75.0 "df_train[""dayofweek""] = df_train[""datetime""].dt.dayofweek df_test[""dayofweek""] = df_test[""datetime""].dt.dayofweek df_train.shape # . # column 19 .",Yes,4,8.0 "df_train[""dayofweek""].value_counts() # 0~6 . 
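# The stripped comment that follows presumably spelled out the pandas
# convention: Series.dt.dayofweek encodes Monday as 0 through Sunday as 6, so
# values 5 and 6 are Saturday and Sunday, i.e. the weekend that the following
# boxplots compare against workingday.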
# 0 = ~ 6 = . # 5,6(, ) workingday Boxplot .",No,5,72.0 "fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows = 5) fig.set_size_inches(18,25) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", ax = ax1) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""workingday"", ax = ax2) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""dayofweek"", ax = ax3) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""weather"", ax = ax4) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""season"", ax = ax5) # pointplot . # plot . # plot , . # plot plot , , . # plot , , . . # plot , .",No,5,75.0 "corr_data = df_train[[""temp"", ""atemp"", ""casual"", ""registered"", ""humidity"", ""windspeed"", ""count""]] colormap = plt.cm.PuBu f , ax = plt.subplots(figsize = (12,10)) plt.title('Correlation of Numeric Features with Rental Count',y=1,size=18) sns.heatmap(corr_data.corr(), vmax=.8, linewidths=0.1,square=True,annot=True,cmap=colormap, linecolor=""white"",annot_kws = {'size':14}) # Heatmap . # count registered. test . # casual. # , , . # temp atemp , .",No,5,80.0 "b""fig, (ax1, ax2, ax3) = plt.subplots(ncols = 3, figsize=(12,5))\n\ntemp_scatter_plot = pd.concat([df_train['count'],df_train['temp']],axis = 1)\nsns.regplot(x='temp',y = 'count',data = temp_scatter_plot,scatter= True, fit_reg=True, ax=ax1)\nwindspeed_scatter_plot = pd.concat([df_train['count'],df_train['windspeed']],axis = 1)\nsns.regplot(x='windspeed',y = 'count',data = windspeed_scatter_plot,scatter= True, fit_reg=True, ax=ax2)\nhumidity_scatter_plot = pd.concat([df_train['count'],df_train['humidity']],axis = 1)\nsns.regplot(x='humidity',y = 'count',data = humidity_scatter_plot,scatter= True, fit_reg=True, ax=ax3)\n\n# Scatterplot .\n# windspeed 0 . \n# 0 Null 0 . """,No,5,33.0 "fig, axes = plt.subplots(nrows = 2, figsize = (18,14)) plt.sca(axes[0]) plt.xticks(rotation = 30, ha = ""right"") axes[0].set(ylabel = ""count"", title = ""train windspeed"") sns.countplot(data = df_train, x = ""windspeed"", ax = axes[0]) plt.sca(axes[1]) plt.xticks(rotation = 30, ha = ""right"") axes[1].set(ylabel = ""count"", title = ""test windspeed"") sns.countplot(data = df_test, x = ""windspeed"", ax = axes[1]) # . # 0 . # Feature engineering .",No,5,33.0 "def concatenate_year_month(datetime): return ""{0}-{1}"".format(datetime.year, datetime.month) df_train[""year_month""] = df_train[""datetime""].apply(concatenate_year_month) df_test[""year_month""] = df_test[""datetime""].apply(concatenate_year_month) print(df_train.shape) df_train[[""datetime"", ""year_month""]].head() # year month . '",Yes,4,8.0 "fig, ax = plt.subplots(figsize = (18,4)) sns.barplot(data = df_train, y = ""count"", x = ""year_month"") # 2011 2012 , . # 2012 , . 
# .",No,5,75.0 df_train.loc[Outliers_to_drop],No,5,14.0 "df_train = df_train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) df_train.shape",Yes,4,10.0 "df_train_num = df_train[[""count"", ""temp"", ""atemp"", ""casual"", ""registered"", ""humidity"", ""windspeed""]] for col in df_train_num: print('{:15}'.format(col), 'Skewness: {:05.2f}'.format(df_train[col].skew()) , ' ' , 'Kurtosis: {:06.2f}'.format(df_train[col].kurt()) ) # '",No,5,40.0 "fig, axes = plt.subplots(nrows = 5, ncols = 2, figsize=(16, 18)) sns.boxplot(data = df_train, y=""count"", x = ""season"", orient = ""v"", ax = axes[0][0]) sns.boxplot(data = df_train, y=""count"", x = ""holiday"", orient = ""v"", ax = axes[0][1]) sns.boxplot(data = df_train, y=""count"", x = ""workingday"", orient = ""v"", ax = axes[1][0]) sns.boxplot(data = df_train, y=""count"", x = ""weather"", orient = ""v"", ax = axes[1][1]) sns.boxplot(data = df_train, y=""count"", x = ""dayofweek"", orient = ""v"", ax = axes[2][0]) sns.boxplot(data = df_train, y=""count"", x = ""month"", orient = ""v"", ax = axes[2][1]) sns.boxplot(data = df_train, y=""count"", x = ""year"", orient = ""v"", ax = axes[3][0]) sns.boxplot(data = df_train, y=""count"", x = ""hour"", orient = ""v"", ax = axes[3][1]) sns.boxplot(data = df_train, y=""count"", x = ""minute"", orient = ""v"", ax = axes[4][0]) axes[0][0].set(ylabel = ""count"", title = ""Rental count by season"") axes[0][1].set(ylabel = ""count"", title = ""Rental count by holiday"") axes[1][0].set(ylabel = ""count"", title = ""Rental count by workingday"") axes[1][1].set(ylabel = ""count"", title = ""Rental count by weather"") axes[2][0].set(ylabel = ""count"", title = ""Rental count by dayofweek"") axes[2][1].set(ylabel = ""count"", title = ""Rental count by month"") axes[3][0].set(ylabel = ""count"", title = ""Rental count by year"") axes[3][1].set(ylabel = ""count"", title = ""Rental count by hour"") axes[4][0].set(ylabel = ""count"", title = ""Rental count by minute"")",No,5,33.0 "f, ax = plt.subplots(1, 1, figsize = (10,6)) g = sns.distplot(df_train[""count""], color = ""b"", label=""Skewness: {:2f}"".format(df_train[""count""].skew()), ax=ax) g = g.legend(loc = ""best"") print(""Skewness: %f"" % df_train[""count""].skew()) print(""Kurtosis: %f"" % df_train[""count""].kurt()) # '",Yes,5,40.0 "df_train[""count_Log""] = df_train[""count""].map(lambda i:np.log(i) if i>0 else 0) f, ax = plt.subplots(1, 1, figsize = (10,6)) g = sns.distplot(df_train[""count_Log""], color = ""b"", label=""Skewness: {:2f}"".format(df_train[""count_Log""].skew()), ax=ax) g = g.legend(loc = ""best"") print(""Skewness: %f"" % df_train['count_Log'].skew()) print(""Kurtosis: %f"" % df_train['count_Log'].kurt()) df_train.drop('count', axis= 1, inplace=True) # .",Yes,4,20.0 "trainWind0 = df_train.loc[df_train[""windspeed""] == 0] trainWindNot0 = df_train.loc[df_train[""windspeed""] != 0] # 0 0 .",No,5,14.0 "from sklearn.ensemble import RandomForestClassifier # RandomForest . def predict_windspeed(data): dataWind0 = data.loc[data[""windspeed""] == 0] dataWindNot0 = data.loc[data[""windspeed""] != 0] # 0 . wcol = [""season"", ""weather"", ""humidity"", ""day"", ""temp"", ""atemp""] # . dataWindNot0[""windspeed""] = dataWindNot0[""windspeed""].astype(""str"") # 0 string . rf_wind = RandomForestClassifier() rf_wind.fit(dataWindNot0[wcol], dataWindNot0[""windspeed""]) wind0 = rf_wind.predict(X=dataWind0[wcol]) # wcol 0 . predictWind0 = dataWind0 predictWindNot0 = dataWindNot0 # . predictWind0[""windspeed""] = wind0 # . 
data = predictWindNot0.append(predictWind0) # 0 . data[""windspeed""] = data[""windspeed""].astype(""float"") # float . data.reset_index(inplace = True) data.drop(""index"", inplace = True, axis = 1) return data '",Yes,2,7.0 "df_train = predict_windspeed(df_train) df_test = predict_windspeed(df_test) fig, (ax1, ax2) = plt.subplots(nrows = 2, figsize = (18,14)) plt.sca(ax1) plt.xticks(rotation = 30, ha = ""right"") ax1.set(ylabel = ""count"", title = ""train windspeed"") sns.countplot(data = df_train, x = ""windspeed"", ax = ax1) plt.sca(ax2) plt.xticks(rotation = 30, ha = ""right"") ax1.set(ylabel = ""count"", title = ""test windspeed"") sns.countplot(data = df_test, x = ""windspeed"", ax = ax2) # , rotation '",Yes,4,33.0 "df_train = pd.get_dummies(df_train, columns = [""weather""], prefix = ""weather"") df_test = pd.get_dummies(df_test, columns = [""weather""], prefix = ""weather"") df_train = pd.get_dummies(df_train, columns = [""season""], prefix = ""season"") df_test = pd.get_dummies(df_test, columns = [""season""], prefix = ""season"") #onehotencoding",No,5,20.0 "corr_data = df_train[[""count_Log"", ""windspeed""]] corr_data.corr()",No,5,40.0 "datetime_test = df_test['datetime'] df_train.drop([""datetime"", ""registered"",""casual"",""holiday"", ""year_month"", ""minute"", ""second""], axis = 1, inplace = True) df_test.drop([""datetime"",""holiday"", ""year_month"", ""minute"", ""second""], axis = 1, inplace = True) '",No,5,10.0 df_test.head(20),No,5,41.0 "from sklearn.model_selection import train_test_split from sklearn import metrics X_train = df_train.drop(""count_Log"", axis = 1).values target_label = df_train[""count_Log""].values X_test = df_test.values X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size = 0.2, random_state = 2000)",No,4,21.0 "from sklearn.ensemble import GradientBoostingRegressor regressor = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, min_samples_leaf=15, min_samples_split=10, random_state =42) regressor.fit(X_tr,y_tr)",Yes,5,7.0 "y_hat = regressor.predict(X_tr) plt.scatter(y_tr, y_hat, alpha = 0.2) plt.xlabel('Targets (y_tr)',size=18) plt.ylabel('Predictions (y_hat)',size=18) plt.show()",Yes,5,56.0 "y_hat_test = regressor.predict(X_vld) plt.scatter(y_vld, y_hat_test, alpha=0.2) plt.xlabel('Targets (y_vld)',size=18) plt.ylabel('Predictions (y_hat_test)',size=18) plt.show()",Yes,5,56.0 "from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score # for classification models=[GradientBoostingRegressor()] model_names=['regressor'] rmsle=[] d={} for model in range (len(models)): clf=models[model] clf.fit(X_tr,y_tr) test_pred=clf.predict(X_vld) rmsle.append(np.sqrt(mean_squared_log_error(test_pred,y_vld))) d={'Modelling Algo':model_names,'RMSLE':rmsle} d",Yes,5,4.0 "from sklearn.model_selection import cross_val_score accuracies = cross_val_score(estimator = regressor, X = X_tr, y = y_tr, cv = 8)",Yes,5,84.0 "use_logvals = 1 pred_xgb = regressor.predict(X_test) sub_xgb = pd.DataFrame() sub_xgb['datetime'] = datetime_test sub_xgb['count'] = pred_xgb if use_logvals == 1: sub_xgb['count'] = np.exp(sub_xgb['count']) sub_xgb.to_csv('xgb.csv',index=False)",Yes,4,55.0 "# read data (train, test) with pd.read_csv(directory) train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") train.head(10) #train.info() #train.shape",Yes,4,45.0 "test = 
pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"") test.head(10)",Yes,4,45.0 "b""y = train['count']\n# y . log scaling outlier .\n# MSE(Mean Square Error) // 900 100 800^2 = 6400 ..\n# , outlier log sacling .\ny.sort_values()""",No,3,41.0 "# y . import matplotlib.pyplot as plt import seaborn as sns # wg, dh = plt.subplots(2,1, figsize=(20,12)) # log scaling . sns.distplot(y, ax=dh[0]) # log . sns.distplot(np.log(y), ax=dh[1]) '",No,3,41.0 "b""# , .\n# y x . train test .\n# y = train['count']\n# log scaling y outlier .\ny = np.log(train['count'])\ny # . .""",No,5,8.0 "############## . # 3 . train['datetime'] = train['datetime'].astype('datetime64') # train['datetime'] = pd.to_datetime(train['datetime']) # train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates = ['datetime']) train.dtypes '",Yes,4,16.0 "test['datetime'] = test['datetime'].astype('datetime64') # test['datetime'] = pd.to_datetime(test['datetime']) # train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates = ['datetime']) test.dtypes'",Yes,4,16.0 "train['year'] = train['datetime'].dt.year train['weekday'] = train['datetime'].dt.weekday train['hour'] = train['datetime'].dt.hour train.head()",Yes,5,8.0 "test['year'] = test['datetime'].dt.year test['weekday'] = test['datetime'].dt.weekday test['hour'] = test['datetime'].dt.hour test.head()",Yes,4,8.0 "train = train.drop(['datetime', 'casual', 'registered', 'count'], 1) test = test.drop('datetime', 1) ",No,5,10.0 "from xgboost import XGBRegressor xgb = XGBRegressor() xgb.fit(train, y) preds = predict(test)",Yes,4,7.0 "sample = pd.read_csv(""/kaggle/input/bike-sharing-demand/sampleSubmission.csv"") sample.head()",Yes,3,45.0 "b""# np.log train , exp .\nsample['count'] = np.exp(preds)\nsample.head()""",Yes,4,8.0 "sample.to_csv(""sample.csv"", index = False) ",No,5,25.0 "################################################################################################# ############################ EDA INSIGHT ############################### y.sort_values()'",No,3,41.0 "train2 = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates=['datetime']) train2['year'] = train2['datetime'].dt.year train2['month'] = train2['datetime'].dt.month train2['day'] = train2['datetime'].dt.day train2['weekday'] = train2['datetime'].dt.weekday train2['hour'] = train2['datetime'].dt.hour test2 = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv', parse_dates=['datetime']) test2['day'] = test2['datetime'].dt.day # () , # mean . outlier , median #train2.groupby('hour')['count'].mean() ##### media . . train2.groupby('hour')['count'].median() '",Yes,3,8.0 "b""a, b = plt.subplots(2,2,figsize=(20,12))\nsns.boxplot(train2['year'], train2['count'], ax=b[0,1])\nsns.boxplot(train2['month'], train2['count'], ax=b[1,1])\n### day 1~19 .!\nsns.boxplot(train2['day'], train2['count'], ax=b[0,0])\n### outlier ? ( 5/ 2) 5 . 
\n### count outlier .\nsns.boxplot(train2['hour'], train2['count'], ax=b[1,0])\n """,No,5,75.0 "b""# class .\ntrain2['datetime'].dt.month.value_counts()""",No,4,72.0 "#Let's import the usual suspects import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",Yes,5,23.0 "#Importing the dataset train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') data = train.append(test, sort = False) data.head()",Yes,4,45.0 "#Histogram for count sns.set_style('darkgrid') sns.distplot(train['count'], bins = 100, color = 'green') plt.show()",No,5,33.0 "#Boxplot for count import matplotlib.pyplot as plt sns.boxplot(x = 'count', data = train, color = 'mediumpurple') plt.show()",No,5,33.0 "#Data without the outliers in count data = data[~data.isin(outliers)] data = data[data['datetime'].notnull()]",No,5,14.0 "sns.barplot(x = 'season', y = 'count', data = train, estimator = np.average, palette='coolwarm') plt.ylabel('Average Count') plt.show()",No,5,33.0 "sns.barplot(x = 'workingday', y = 'count', data = train, estimator = np.average, palette='colorblind') plt.ylabel('Average Count') plt.show()",No,5,33.0 "sns.barplot(x = 'weather', y = 'count', data = train, estimator = np.average, palette='deep') plt.ylabel('Average Count') plt.show() ",No,5,33.0 "plt.figure(figsize = (10,7)) tc = train.corr() sns.heatmap(tc, annot = True, cmap = 'coolwarm', linecolor = 'white', linewidths=0.1)",No,5,80.0 "#Convert to integer variables columns=['season', 'holiday', 'workingday', 'weather'] for i in columns: data[i] = data[i].apply(lambda x : int(x))",No,5,8.0 "#Convert string to datatime and create Hour, Month and Day of week data['datetime'] = pd.to_datetime(data['datetime']) data['Hour'] = data['datetime'].apply(lambda x:x.hour) data['Month'] = data['datetime'].apply(lambda x:x.month) data['Day of Week'] = data['datetime'].apply(lambda x:x.dayofweek)",No,4,8.0 "plt.figure(figsize = (8,4)) sns.lineplot(x = 'Month', y = 'count', data = data, estimator = np.average, hue = 'weather', palette = 'coolwarm') plt.ylabel('Average Count') plt.show()",No,5,75.0 data[data['weather'] == 4],No,5,14.0 "fig, axes = plt.subplots(ncols = 2, figsize = (15,5), sharey = True) sns.pointplot(x = 'Hour', y = 'count', data = data, estimator = np.average, hue = 'workingday', ax = axes[0], palette = 'muted') sns.pointplot(x = 'Hour', y = 'count', data = data, estimator = np.average, hue = 'holiday', ax = axes[1], palette = 'muted') ax = [0,1] for i in ax: axes[i].set(ylabel='Average Count')",No,5,75.0 "plt.figure(figsize = (10,4)) sns.pointplot(x = 'Hour', y = 'count', data = data, estimator=np.average, hue = 'Day of Week', palette='coolwarm')",No,5,75.0 "sns.jointplot(x = 'atemp', y = 'count', data = data, kind = 'kde', cmap = 'plasma') plt.show()",No,3,33.0 "plt.figure(figsize = (8,4)) sns.pointplot(x = 'Hour', y = 'casual', data = data, estimator = np.average, color = 'blue') sns.pointplot(x = 'Hour', y = 'registered', data = data, estimator = np.average, color = 'red') plt.ylabel('Registered') plt.show()",No,5,75.0 "#Histogram for Windspeed sns.set_style('darkgrid') sns.distplot(data['windspeed'], bins = 100, color = 'purple') #Windspeed cannot be 0. 
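# Comment only: a windspeed of 0 here is effectively a missing-value code
# rather than a real measurement, which is why the next cell converts the zeros
# to NaN and imputes them.  Note that the fillna there uses
# data.groupby('weather')['season'].transform('mean'); judging from that cell's
# own comment ("mean value grouped by season"), the intended expression was
# probably data.groupby('season')['windspeed'].transform('mean').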
plt.show()",No,5,33.0 "#Replacing 0s in windspeed with the mean value grouped by season data['windspeed'] = data['windspeed'].replace(0, np.nan) data['windspeed'] = data['windspeed'].fillna(data.groupby('weather')['season'].transform('mean')) sns.distplot(data['windspeed'], bins = 100, color = 'red') plt.show()",Yes,3,17.0 "#Encoding cyclical features data['Month_sin'] = data['Month'].apply(lambda x: np.sin((2*np.pi*x)/12)) data['Month_cos'] = data['Month'].apply(lambda x: np.cos((2*np.pi*x)/12)) data['Hour_sin'] = data['Hour'].apply(lambda x: np.sin((2*np.pi*(x+1))/24)) data['Hour_cos'] = data['Hour'].apply(lambda x: np.cos((2*np.pi*(x+1))/24)) data['DayOfWeek_sin'] = data['Day of Week'].apply(lambda x: np.sin((2*np.pi*(x+1))/7)) data['DayOfWeek_cos'] = data['Day of Week'].apply(lambda x: np.cos((2*np.pi*(x+1))/7))",No,5,8.0 "#trainsforming target variable using log transformation data['count'] = np.log(data['count'])",No,5,8.0 "#Converting Categorical to numerical - Removing Co-Linearity data_ = pd.get_dummies(data=data, columns=['season', 'holiday', 'workingday', 'weather']) train_ = data_[pd.notnull(data_['count'])].sort_values(by=[""datetime""]) test_ = data_[~pd.notnull(data_['count'])].sort_values(by=[""datetime""])'",No,4,9.0 "#Standardizing numerical variables from sklearn.preprocessing import StandardScaler cols = ['temp','atemp','humidity', 'windspeed', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'DayOfWeek_sin','DayOfWeek_cos'] features = data[cols] #Standard Scaler scaler = StandardScaler().fit(features.values) data[cols] = scaler.transform(features.values)",Yes,5,18.0 "from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import metrics",No,5,22.0 "#train test split X = train_[cols] y = train_['count'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)",Yes,5,13.0 "lm = LinearRegression() lm.fit(X_train, y_train) print(lm.intercept_)",Yes,5,7.0 "plt.figure(figsize = (18,4)) coeff = pd.DataFrame(lm.coef_, index = X.columns, columns = ['Coefficient']) sns.barplot(x = coeff.index, y = 'Coefficient', data = coeff, color = 'red')",No,5,79.0 "plt.figure(figsize = (8,4)) pred = lm.predict(X_test) sns.scatterplot(x = y_test, y = pred) plt.xlabel('Count') plt.ylabel('Predictions') plt.show()",No,5,56.0 "sns.distplot((y_test-pred),bins=100, color = 'gray') plt.show()",No,4,33.0 "print('RMSLE:', np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))))",No,5,49.0 "from sklearn.linear_model import Ridge #Assiging different sets of alpha values to explore which can be the best fit for the model. temp_msle = {} for i in np.linspace(0, 40, 20): ridge = Ridge(alpha= i, normalize=True) #fit the model. ridge.fit(X_train, y_train) ## Predicting the target value based on ""Test_x"" pred = ridge.predict(X_test) msle = np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))) temp_msle[i] = msle",Yes,5,2.0 "from sklearn.linear_model import Lasso ## Assiging different sets of alpha values to explore which can be the best fit for the model. temp_msle = {} for i in np.logspace(-10, -1, 20): ## Assigin each model. lasso = Lasso(alpha= i, normalize=True, tol = 0.1) ## fit the model. 
lasso.fit(X_train, y_train) ## Predicting the target value based on ""Test_x"" pred = lasso.predict(X_test) msle = np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))) temp_msle[i] = msle",Yes,5,2.0 "from sklearn.ensemble import RandomForestRegressor rfr = RandomForestRegressor(n_estimators = 500) rfr.fit(X_train, y_train)",Yes,5,7.0 "plt.figure(figsize = (8,4)) pred = rfr.predict(X_test) sns.scatterplot(x = y_test, y = pred) plt.xlabel('Count') plt.ylabel('Predictions') plt.show()",No,5,56.0 "sns.distplot((y_test-pred),bins=100, color = 'gray')",No,3,33.0 "#RMSLE print('RMSLE:', np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))))",No,5,49.0 "#submission new = test_[cols] pred = rfr.predict(new) submission = pd.DataFrame({'datetime':test['datetime'],'count':np.exp(pred)}) submission['count'] = submission['count'].astype(int) submission.to_csv('submission.csv',index=False)",Yes,4,25.0 "b""import pandas as pd\nimport numpy as np\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# \n%matplotlib inline\n\n# \nmpl.rcParams['axes.unicode_minus']=False\n\nimport warnings\nwarnings.filterwarnings('ignore')""",Yes,5,23.0 "train=pd.read_csv(""../input/bike-sharing-demand/train.csv"", parse_dates=[""datetime""]) train.shape ",Yes,4,45.0 "test=pd.read_csv(""../input/bike-sharing-demand/test.csv"", parse_dates=[""datetime""]) test.shape",Yes,4,45.0 "train[""year""]=train[""datetime""].dt.year train[""month""]=train[""datetime""].dt.month train[""day""]=train[""datetime""].dt.day train[""hour""]=train[""datetime""].dt.hour train[""minute""]=train[""datetime""].dt.minute train[""second""]=train[""datetime""].dt.second train[""dayofweek""]=train[""datetime""].dt.dayofweek train.shape",Yes,5,8.0 "test[""year""]=test[""datetime""].dt.year test[""month""]=test[""datetime""].dt.month test[""day""]=test[""datetime""].dt.day test[""hour""]=test[""datetime""].dt.hour test[""minute""]=test[""datetime""].dt.minute test[""second""]=test[""datetime""].dt.second test[""dayofweek""]=test[""datetime""].dt.dayofweek test.shape",Yes,5,8.0 "# widspeed 0 . => fig, axes = plt.subplots(nrows=2) fig.set_size_inches(18,10) plt.sca(axes[0]) plt.xticks(rotation=30, ha='right') axes[0].set(ylabel='Count',title=""train windspeed"") sns.countplot(data=train, x=""windspeed"", ax=axes[0]) plt.sca(axes[1]) plt.xticks(rotation=30, ha='right') axes[1].set(ylabel='Count',title=""test windspeed"") sns.countplot(data=test, x=""windspeed"", ax=axes[1])'",No,5,33.0 "b""# 0 .\ntrainWind0 = train.loc[train['windspeed'] == 0]\ntrainWindNot0 = train.loc[train['windspeed'] != 0]\nprint(trainWind0.shape)\nprint(trainWindNot0.shape)""",Yes,4,14.0 "# . from sklearn.ensemble import RandomForestClassifier def predict_windspeed(data): # 0 . dataWind0 = data.loc[data['windspeed'] == 0] dataWindNot0 = data.loc[data['windspeed'] != 0] # . wCol = [""season"", ""weather"", ""humidity"", ""month"", ""temp"", ""year"", ""atemp""] # 0 . dataWindNot0[""windspeed""] = dataWindNot0[""windspeed""].astype(""str"") # . rfModel_wind = RandomForestClassifier() # wCol . rfModel_wind.fit(dataWindNot0[wCol], dataWindNot0[""windspeed""]) # 0 . wind0Values = rfModel_wind.predict(X = dataWind0[wCol]) # # . predictWind0 = dataWind0 predictWindNot0 = dataWindNot0 # 0 . predictWind0[""windspeed""] = wind0Values # dataWindNot0 0 . data = predictWindNot0.append(predictWind0) # float . 
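# The statement that merges the two halves back together appears to have been
# lost when this cell was flattened; mirroring the otherwise identical
# predict_windspeed() defined earlier in this file, it would read:
data = predictWindNot0.append(predictWind0)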
data[""windspeed""] = data[""windspeed""].astype(""float"") data.reset_index(inplace=True) data.drop('index', inplace=True, axis=1) return data'",Yes,2,7.0 "# 0 . train = predict_windspeed(train) # test = predict_windspeed(test) # widspeed 0 fig, ax1 = plt.subplots() fig.set_size_inches(18,6) plt.sca(ax1) # 30 plt.xticks(rotation=30, ha='right') ax1.set(ylabel='Count',title=""train windspeed"") sns.countplot(data=train, x=""windspeed"", ax=ax1)'",Yes,5,56.0 "# feature feature # feature = [""temp"",""humidity"",""windspeed"",""atemp""] # feature type category . weather 1,2,3,4 2=> . # feature one-hot-encodding . categorical_feature_names = [""season"",""holiday"",""workingday"",""weather"", ""dayofweek"",""month"",""year"",""hour""] for var in categorical_feature_names: train[var] = train[var].astype(""category"") test[var] = test[var].astype(""category"")'",No,5,16.0 "# dateset X_train = train[feature_names] print(X_train.shape) X_train.head()'",Yes,4,41.0 "X_test = test[feature_names] print(X_test.shape) X_test.head()",Yes,4,41.0 "label_name = ""count"" y_train = train[label_name] print(y_train.shape) y_train.head()",Yes,4,41.0 "from sklearn.metrics import make_scorer def rmsle(predicted_values, actual_values): # . predicted_values = np.array(predicted_values) actual_values = np.array(actual_values) # 1 . log_predict = np.log(predicted_values + 1) log_actual = np.log(actual_values + 1) # . difference = log_predict - log_actual # difference = (log_predict - log_actual) ** 2 difference = np.square(difference) # . mean_difference = difference.mean() # . score = np.sqrt(mean_difference) return score rmsle_scorer = make_scorer(rmsle) rmsle_scorer'",No,3,49.0 "from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score k_fold = KFold(n_splits=10, shuffle=True, random_state=0)",Yes,5,84.0 "from sklearn.ensemble import RandomForestRegressor max_depth_list = [] # n_estimators ;; 100 model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0) model '",Yes,5,4.0 "%time score = cross_val_score(model, X_train, y_train, cv=k_fold, scoring=rmsle_scorer) score = score.mean() # 0 print(""Score= {0:.5f}"".format(score))'",No,4,28.0 "# , ( ) - model.fit(X_train, y_train)'",No,5,7.0 "# predictions = model.predict(X_test) print(predictions.shape) predictions[0:10]'",Yes,4,48.0 "# . 
fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sns.distplot(y_train,ax=ax1,bins=50) ax1.set(title=""train"") sns.distplot(predictions,ax=ax2,bins=50) ax2.set(title=""test"")'",No,4,33.0 "submission = pd.read_csv(""../input/bike-sharing-demand/sampleSubmission.csv"") submission submission[""count""] = predictions print(submission.shape) submission.head()",Yes,3,45.0 "submission.to_csv(""Score_{0:.5f}_sampleSubmission.csv"".format(score), index=False)",No,5,25.0 "#carregar os dados train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') teste = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')",No,5,45.0 "#verificando df treino train.info()",No,5,40.0 "#Verificando o df de teste teste.info()",No,5,40.0 "#Transformando o dataframe original na coluna count #vamos usar escala logaritimica train['count'] = np.log(train['count']) ",No,5,8.0 train = train.append(teste),No,3,11.0 "train.head() ",No,5,41.0 train['datetime'] = pd.to_datetime(train['datetime']),No,5,16.0 "#crindo nova coluna usando data e hora train['year'] = train['datetime'].dt.year train['month'] = train['datetime'].dt.month train['day'] = train['datetime'].dt.day train['hour'] = train['datetime'].dt.hour train['dayofweek'] = train['datetime'].dt.dayofweek ",No,5,16.0 "#separando o df de treino e teste #primeiro teste teste = train[train['count'].isnull()]",No,5,13.0 teste.shape,No,5,58.0 "#separando o df de treino e teste #segundo treino treino = train[~train['count'].isnull()]",No,4,13.0 treino.shape,No,5,58.0 "#Separando o df de treino em treino/validao (def = 75/25) from sklearn.model_selection import train_test_split treino, validacao = train_test_split(treino, random_state=42)'",No,5,13.0 "print(treino.shape) treino.head()",Yes,4,41.0 "print(validacao.shape) validacao.head() ",Yes,4,41.0 "#importando from sklearn.tree import DecisionTreeRegressor",No,5,22.0 "#instanciando objeto de decision tree ad = DecisionTreeRegressor(random_state=42)",No,5,4.0 "#treinando o modelo #informar as colunas de entrada e a coluna de resposta (target) ad.fit(treino[usadas], treino['count'])",No,5,7.0 "#prever os dados de validao previsao = ad.predict(validacao[usadas])'",No,5,48.0 "#usando a metrica para validar os dados from sklearn.metrics import mean_squared_error",No,5,22.0 "#instanciar o modelo rf = RandomForestRegressor(random_state=42, n_jobs=1)",No,5,4.0 "#treinando o modelo rf.fit(treino[usadas], treino['count'])",No,5,7.0 "#Fazendo previses em cima dos dados de validao preds = rf.predict(validacao[usadas])'",No,5,48.0 "#verificando o modelo com relao a mtrica #importando a mtrica from sklearn.metrics import mean_squared_error'",No,5,22.0 "b""#aplicando a mtrica\nmean_squared_error(validacao['count'], preds) ** (1/2)\n#0.348204 do professor""",No,5,49.0 "#vamos prever com base nos dados de treino # como o modelo se comporta prevendo em cima de dados conhecidos # o modelo ja esta treinado treino_preds = rf.predict(treino[usadas]) mean_squared_error(treino['count'], treino_preds) ** (1/2)",No,4,49.0 "b""#Gerando as previses para envio ao Kaggle\nteste['count'] = np.exp(rf.predict(teste[usadas]))""",Yes,5,48.0 "#visualizando o arquivo para envio teste[['datetime','count']].head()",No,4,48.0 "#gerando csv teste[['datetime','count']].to_csv('rf.csv', index=False)",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error",No,5,22.0 "b""# importando as bases \ntreino = 
pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') # trino dados do dia 1 ao dia 19\nteste = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') # base de teste do dia 20 ao final do ms""",No,5,45.0 "treino.shape, teste.shape",No,5,58.0 "display(treino.info()) display(teste.info())",No,5,40.0 "b""# Aplicar log na varivel de resposta\ntreino['count'] = np.log(treino['count'])""",No,5,8.0 "# Juntando os dataframes para realizar as modificaes # As observaes do teste ficaram com o campo count nulo # Concatenando as bases para realizar as transformaes nas duas bases de uma vez s treino = treino.append(teste)'",No,5,11.0 "b""# transformando o tipo da varivel datetime em datetime\ntreino['datetime'] = pd.to_datetime(treino['datetime'])""",No,5,16.0 "# Criando novas colunas com a dada e hora (feature engeneering) treino['year'] = treino['datetime'].dt.year treino['month'] = treino['datetime'].dt.month treino['day'] = treino['datetime'].dt.day treino['dayofweek'] = treino['datetime'].dt.dayofweek treino['hour'] = treino['datetime'].dt.hour ",No,5,8.0 "# separando so dataframes teste = treino[treino['count'].isnull()]",No,5,13.0 treino = treino[~treino['count'].isnull()],No,5,14.0 "treino, validacao = train_test_split(treino, random_state=42)",No,5,13.0 "display(treino.info()) display(validacao.info())",No,5,40.0 "b""# selecionando as variveis que sero utilizadas no treinamento\nnao_usadas = ['casual', 'registered', 'count', 'datetime']\n\n# Xriar a lista das colunas de entrada\nusadas = [c for c in treino.columns if c not in nao_usadas]""",No,2,14.0 "# Instanciando o modelo random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)",No,5,4.0 "# Treinando o modelo random_forest.fit(treino[usadas], treino['count'])",No,5,7.0 "# Prevendo os resultados previsao = random_forest.predict(validacao[usadas])",No,5,48.0 "# Avaliando o modelo com o SRMSLE (Square Root Mean Squared Log Error) mean_squared_error(validacao['count'], previsao)**(1/2)",No,5,49.0 "b""# Vamos prever com base nos dados de treino\n# como o modelo se comporta prevendo em cima de dados conhecidos\n# Verificar se est generalizando bem, caso o erro seja zero na base de treino, um forte sinal de overfitting\n\ntreino_preds = random_forest.predict(treino[usadas])\nmean_squared_error(treino['count'], treino_preds) ** (1/2)""",No,4,27.0 "b""# Gerando as previses para envio ao Kaggle\n\nteste['count'] = np.exp(random_forest.predict(teste[usadas]))""",No,5,48.0 "# Gerando o arquivo para submeter ao kaggle teste[['datetime', 'count']].head()",No,5,41.0 "teste[['datetime', 'count']].to_csv('rf.csv' ,index=False)",No,5,25.0 "# verificando o df de treino df.info()",No,5,40.0 "b""# vamos transformar os dados. toda a transformao dever ser replicada nos dados de teste\n\n# aplicar log na varivel de resposta\n\ndf['count'] = np.log(df['count'])""",No,5,8.0 "#apendando os dois para poder fazer a transformao de uma dez s. depois separa df = df.append(test)'",No,5,11.0 "#convertendo a coluna datetime df['datetime'] = pd.to_datetime(df['datetime'])",No,5,16.0 "#criando novas colunas usando data e hora df['year'] = df['datetime'].dt.year df['month'] = df['datetime'].dt.month df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofweek'] = df['datetime'].dt.dayofweek",No,5,8.0 "b""# dividir os dados que foram juntados para a transformao - treino e teste. 
se estiver dado nulo nas tres variveis target\n# pertence ao df de teste\n\n# primeiro os dados de teste\ntest = df[df['count'].isnull()]""",No,5,13.0 "b""# agora os dados de treino o sinal de til a negao quando uma comparao no est envolvida\n\ndf = df[~df['count'].isnull()]""",No,5,14.0 "# dividindo o df de treino # importando o scikitlearn para a diviso da base from sklearn.model_selection import train_test_split'",No,5,22.0 "#dividir 75% treino e 25% validao - padro train, valid = train_test_split(df, random_state=42) '",No,5,13.0 "b""#escolher as colunas que vo ser usadas e as que no\n\n# lista das colunas no usadas\nremoved_cols = ['casual', 'registered', 'count', 'datetime']\n\n#lista das columas de entrada\n\nfeats = [c for c in train.columns if c not in removed_cols]\n""",No,2,14.0 feats,No,3,71.0 "# usando o random forest # importando o modelo from sklearn.ensemble import RandomForestRegressor",No,5,22.0 "# instanciar o modelo rf = RandomForestRegressor(random_state=42,n_jobs=-1) #n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores #n_estimator - nro de arvores. o defalt 10 mas mudar para 100 na proxima verso 0.22 '",No,5,4.0 "#treinar o modelo com os dados de treino rf.fit(train[feats], train['count'])",No,5,7.0 "#faznedo as previses em cima dos dados de validao preds = rf.predict(valid[feats])'",No,5,48.0 "b""#aplicando a mtrica\nmean_squared_error(valid['count'], preds) ** (1/2)\n""",No,5,49.0 train_preds = rf.predict(train[feats]),No,5,27.0 "# aplicando nos dados de treino #dados conhecidos mean_squared_error(train['count'], train_preds) ** (1/2)",No,5,28.0 " test['count'] = np.exp(rf.predict(test[feats]))",No,5,48.0 "test[['datetime', 'count']].head()",No,5,41.0 "b""#Aumentar a floresta - 200 rvores\n# instanciar o modelo\nrf2 = RandomForestRegressor(random_state=42,n_jobs=-1, n_estimators=200, min_samples_leaf=5)\n#n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores\n#n_estimator - nro de arvores. o defalt 10 mas mudar para 100 na proxima verso 0.22\n\n#treinar o modelo com os dados de treino\n\nrf2.fit(train[feats], train['count'])\n\n#Previses com os dados de validao\npreds2 = rf2.predict(valid[feats])\n\n# Aplicar mtrica sobre os dados de validao\n\nmean_squared_error(valid['count'], preds2)**(1/2)""",Yes,4,7.0 "#Gerando o novo arquivo test['count'] = np.exp(rf2.predict(test[feats])) #visualizando o arquivo para envio test[['datetime', 'count']].to_csv('rf2.csv', index=False)",Yes,5,25.0 "#Pandas Rolling df = df.append(test)",No,5,11.0 "#ordenando o dataframe df.sort_values('datetime', inplace=True)",No,5,9.0 "#Criando a coluna rolling_temp df['rolling_temp'] = df['temp'].rolling(3,min_periods=1).mean()",No,4,8.0 "#Criando a coluna rolling_atemp df['rolling_atemp'] = df['atemp'].rolling(3,min_periods=1).mean()",No,5,8.0 "#Separando os dataframes test = df[df['casual'].isnull()] df = df[~df['casual'].isnull()]",No,5,14.0 "# Dividindo os dados de treino em train and validation train, valid = train_test_split(df, random_state=42)",No,5,13.0 "#escolher as colunas que vo ser usadas #lista das columas de entrada feats = [c for c in train.columns if c not in removed_cols]'",No,3,14.0 "b""#Novo modelo usando colunas roling\n# instanciar o modelo\nrf3 = RandomForestRegressor(random_state=42,n_jobs=-1, n_estimators=200, min_samples_leaf=5)\n#n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores\n#n_estimator - nro de arvores. 
o defalt 10 mas mudar para 100 na proxima verso 0.22\n\n#treinar o modelo com os dados de treino\n\nrf3.fit(train[feats], train['count'])\n\n#Previses com os dados de validao\npreds3 = rf3.predict(valid[feats])\n\n# Aplicar mtrica sobre os dados de validao\n\nmean_squared_error(valid['count'], preds3)**(1/2)""",Yes,3,7.0 "#Gerando o novo arquivo test['count'] = np.exp(rf3.predict(test[feats])) #visualizando o arquivo para envio test[['datetime', 'count']].to_csv('rf3.csv', index=False)",Yes,4,48.0 "# import modules from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso from matplotlib import pyplot as plt from datetime import datetime as dt import seaborn as sns # set graphics dark mode plt.style.use('dark_background') # import dataset trainset = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') testset = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') # dataset quick view trainset.head()",Yes,3,45.0 "# create date feature from datetime trainset.insert(1, 'date', pd.DataFrame([x[:10] for x in trainset.datetime])) # create time feature from datetime trainset.insert(2, 'time', pd.DataFrame([x[11:] for x in trainset.datetime])) # convert datetime from string to datetime trainset.date = [dt.strptime(x, '%Y-%m-%d').weekday() for x in trainset.date] # drop datetime column since we created two variables and casual and registered since their value is contained in count trainset.drop(['datetime'], axis = 1, inplace = True) # trainset quick view trainset.head()",Yes,4,16.0 "# get index of the time elements in unique list _, idx = np.unique(trainset.time, return_inverse = True) # replace time feature with the index just computed trainset.time = idx # trainset quick view trainset.head()",No,5,8.0 "# date - count boxplot plt.figure(), sns.boxplot(x = trainset['date'], y = trainset['count'])",No,5,75.0 "# check sum of nulls trainset.isnull().sum()",No,5,39.0 "# draw the pairplot of the variables plt.figure(), sns.pairplot(trainset) # check target boxplot to see outliers plt.figure(), sns.boxplot(trainset['count'])",Yes,5,81.0 "# apply log transform to remove the number of outliers trainset['count'] = np.log(trainset['count']) # repeat pairplot plt.figure(), sns.pairplot(trainset) # repeat boxplot plt.figure(), sns.boxplot(trainset['count'])",Yes,5,81.0 "# variables correlation heatmap plt.figure(figsize = (10,10)), sns.heatmap(trainset.corr())",No,5,80.0 "# remove features highly correlated trainset.drop(['casual','registered','temp'], axis = 1, inplace = True) # graph heatmap again plt.figure(figsize = (10,10)), sns.heatmap(trainset.corr())",Yes,3,10.0 "# group time values into day segments trainset.time = [0 if x >= 0 and x < 6 else(1 if x > 5 and x < 13 else (2 if x > 12 and x < 19 else 3)) for x in trainset.time] # trainset quick view trainset.head()",Yes,4,16.0 "# get original datetime column for submission testdates = testset.datetime # create date feature from datetime testset.insert(1, 'date', pd.DataFrame([x[:10] for x in testset.datetime])) # create time feature from datetime testset.insert(2, 'time', pd.DataFrame([x[11:] for x in testset.datetime])) # convert datetime from string to datetime testset.date = [dt.strptime(x, '%Y-%m-%d').weekday() for x in testset.date] # drop datetime column since we created two variables and casual and registered since their value is contained in count 
testset.drop(['datetime'], axis = 1, inplace = True) # get index of the time elements in unique list _, idx = np.unique(testset.time, return_inverse = True) # replace time feature with the index just computed testset.time = idx # replace date with weekday testset.date = [1 if x >= 0 and x < 6 else 0 for x in testset.date] # replace feature name testset.rename(columns = {'date':'weekday'}, inplace = True) # remove features highly correlated testset.drop(['temp'], axis = 1, inplace = True) # group time values into day segments testset.time = [0 if x >= 0 and x < 6 else(1 if x > 5 and x < 13 else (2 if x > 12 and x < 19 else 3)) for x in testset.time] # testset quick view testset.head()",Yes,5,8.0 "# features Xtrain = trainset.iloc[:,:-1] Xtest = testset.iloc[:,:] # target ytrain = trainset.iloc[:,-1] # standard scaler sca = StandardScaler().fit(Xtrain) # standarize features Xtrain = sca.transform(Xtrain) Xtest = sca.transform(Xtest) # classifier clf = RandomForestRegressor(random_state = 0) # regularization parameter range param_grid = {'n_estimators': [25, 50, 100], 'max_features': [3, 6]} # grid search grid = GridSearchCV(estimator = clf, scoring = 'neg_mean_squared_log_error', param_grid = param_grid) # training clf.fit(Xtrain, ytrain) # predictions preds = np.round(np.exp(clf.predict(Xtest))) # clip negatives in case there are preds[preds < 0] = 0 # submission pd.DataFrame({'datetime': testdates, 'count': preds}).to_csv('my_submission.csv', index = False)",Yes,3,6.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from scipy.special import boxcox, inv_boxcox train_df=pd.read_csv('../input/train.csv') train_df.describe()",No,3,45.0 "sns.boxplot(train_df['count']) plt.show() cnt=train_df['count'].values q99=np.percentile(cnt,[99]) train_df=train_df[train_df['count'] '",Yes,5,88.0 "train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv',parse_dates = [""datetime""])# object train.head(30) # 5 .() . train['hour']=train['datetime'].dt.hour #hour, year train['year'] = train['datetime'].dt.year train['dayofweek']=train['datetime'].dt.dayofweek #weekday train['day']=train['datetime'].dt.day train['month']=train['datetime'].dt.month #train['week']=train['datetime'].dt.week train.head(30)'",Yes,3,8.0 "test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv',parse_dates = [""datetime""])# (count) . test['hour'] = test['datetime'].dt.hour test['year'] = test['datetime'].dt.year test['dayofweek']=test['datetime'].dt.dayofweek test['day']=test['datetime'].dt.day test['month']=test['datetime'].dt.month #test['week']=test['datetime'].dt.week test.head(30)'",Yes,3,8.0 "b""weekday_df=train[train['workingday']==1] # ==. workingday 1 True => True . \nprint(weekday_df.shape) # \n\nweekend_df=train[train['workingday']==0]\nweekend_df.shape""",No,4,14.0 "b""import matplotlib.pyplot as plt # matplotlib \nimport seaborn as sns # \na,b = plt.subplots(1,1,figsize=(20,12)) #,, \nsns.boxplot(train['hour'],train['count']) #boxplot => (count), (hour) \n# => \n# => 25%\n# => \n# => . . \n# 10 ~15 . => 5 , . => .=> .\n""",Yes,5,75.0 "b""a,b = plt.subplots(1,1,figsize=(20,12))\nsns.distplot(train['count'])# \n# \n# ,\n#1. . \n#2. .( )\n\n# y(train count) \n# train test . \n# ? => . . . \n#=> => y .\n#=> . \n\n""",No,5,33.0 "b""a,b = plt.subplots(1,1,figsize=(20,12)) \nsns.boxplot(train['month'],train['count']) #tree column 1 column . day column . 
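# The count target is strongly right-skewed (see the distplot above); this notebook
# later fits on np.log(train['count']) and inverts predictions with np.exp. A minimal
# zero-safe variant of that transform, shown here only as an illustration, uses
# log1p/expm1:
import numpy as np
_y_log = np.log1p(train['count'])   # log(1 + y) is defined even when y == 0
_y_back = np.expm1(_y_log)          # exact inverse of log1p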
""",No,5,33.0 "a,b = plt.subplots(1,1,figsize=(20,12)) sns.boxplot(weekday_df['hour'],weekday_df['count']) ",No,5,75.0 "a,b = plt.subplots(1,1,figsize=(20,12)) sns.boxplot(weekend_df['hour'],weekend_df['count']) ",No,5,75.0 "figure, (a,b,c,d,e,f) = plt.subplots(nrows=6) figure.set_size_inches(18,25) sns.pointplot(train['hour'],train['count'], ax = a) sns.pointplot(train['hour'],train['count'],hue = train['workingday'], ax = b) sns.pointplot(train['hour'],train['count'],hue = train['holiday'], ax = c) sns.pointplot(train['hour'],train['count'],hue = train['dayofweek'], ax = d) sns.pointplot(train['hour'],train['count'],hue = train['season'], ax = e) sns.pointplot(train['hour'],train['count'],hue = train['weather'], ax = f)",No,5,75.0 "b""print(train.groupby('year')['count'].mean()) # . \ntrain.groupby('year')['count'].median() #. => """,No,2,40.0 "b""train_2011=train[train['year']==2011] #2011 .\ntrain_2011.groupby('month')['count'].mean()\na,b=plt.subplots(1,1,figsize=(20,12))\nsns.boxplot(train_2011['month'],train['count'])""",No,4,33.0 "b""print(train.groupby('dayofweek')['count'].mean()) # 0~6 => ~\ntrain.groupby('holiday')['count'].mean() # ?""",No,3,40.0 "b""train['dayofweek'].value_counts() # . . . """,No,5,72.0 train.dtypes #datetime object. . .,No,5,70.0 "b""train2 = train.drop(['datetime','casual','registered','count','month','day'],axis=1) # datetime, test 3 . . \n# train2 4 . axis=0 -> row . axis=1 -> column . \ntrain2.head()""",Yes,5,10.0 "b""test2 = test.drop(['datetime','month','day'],axis=1) # test datetime \ntest2.head()""",Yes,4,10.0 "b""# # \n# from sklearn.ensemble import RandomForestRegressor \n\n# # \n# rf = RandomForestRegressor(n_estimators=100,random_state=1,n_jobs=4) # / /\n\n#; 10-> => 100 / / \n#random_state => / n_jobs =4 , \n\n# # 100 ? => . train set \n# . test set a=b \n# \n\n# #\n# rf.fit(train2,np.log(train['count']))\n# #\n# result = rf.predict(test2)\n# test['count']= result #test count result . \n# test.head(10)\n\n# # column , . . \n# # \n# #1. .\n# #2. . \n\n\n# 100 . lgbm . \n# lgbm\n# from lightgbm import LGBMRegressor\n# lgb=LGBMRegressor()\n# lgb.fit(train2,np.log(train['count']))\n# result=lgb.predict(test2)\n# test['count']=result\n# test.head()\n\n# lgbm xgboost ? => X. . \n\n#2 xgboost => . 100 .\n# * . . .\n\n# # ***\n# tree =>\n# tree . => max_depth\n#xgb=> max_depth 3 . . \n\n\nfrom xgboost import XGBRegressor\nxgb=XGBRegressor(nthread=4,max_depth=5) # . . nthread=> . n_jobs . CPU .\nxgb.fit(train2,np.log(train['count']))\nresult=xgb.predict(test2)""",Yes,4,7.0 "Sub = pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') Sub.head()",Yes,4,45.0 "b""Sub['count'] = np.exp(result) # #\nSub.head()""",Yes,5,55.0 "b""Sub.to_csv('20191231.csv',index=False) # index=False index=True . column 2. """,No,5,25.0 "import pandas as pd %matplotlib inline import seaborn as sns import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm import warnings; warnings.simplefilter('ignore') # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))'",Yes,5,88.0 "train=data=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') train.info() Y1train=train['casual'] Y2train=train['registered'] Ytrain=train['count'] ",No,3,45.0 "figure, axs = plt.subplots(nrows=3, ncols=2) figure.set_size_inches(14,6) sns.distplot(Ytrain, ax=axs[0][0], fit=norm) sns.distplot(np.log(Ytrain+1), ax=axs[0][1], fit=norm) sns.distplot(Y1train, ax=axs[1][0], fit=norm) sns.distplot(np.log(Y1train+1), ax=axs[1][1], fit=norm) sns.distplot(Y2train, ax=axs[2][0], fit=norm) sns.distplot(np.log(Y2train+1), ax=axs[2][1], fit=norm)",No,5,33.0 "feature_names=list(test) train=train[feature_names] all_data=pd.concat((train, test)) print(train.shape, test.shape, all_data.shape) print(Ytrain) all_data['datetime']=pd.to_datetime(all_data['datetime']) all_data['year']=all_data['datetime'].dt.year all_data['month']=all_data['datetime'].dt.month all_data['day']=all_data['datetime'].dt.day all_data['hour']=all_data['datetime'].dt.hour all_data['dayofweek']=all_data['datetime'].dt.dayofweek all_data=all_data.drop(columns='datetime') all_data.loc[all_data['windspeed']==0, 'windspeed']=all_data['windspeed'].mean() print(train.shape, test.shape, all_data.shape)",Yes,3,8.0 "Xtrain=all_data[:len(train)] Xtest=all_data[len(train):] Xtrain.info() #"""""" import itertools import copy tmpXtrain = copy.deepcopy(Xtrain) tmpXtest = copy.deepcopy(Xtest) for cmb in itertools.combinations_with_replacement(list(Xtrain.keys()), 2): tmpXtrain[""-"".join(cmb)] = Xtrain[cmb[0]] * Xtrain[cmb[1]] tmpXtest[""-"".join(cmb)] = Xtest[cmb[0]] * Xtest[cmb[1]] #""""""",Yes,4,13.0 !pip install optuna,No,5,87.0 "import optuna.integration.lightgbm as lgb from sklearn.model_selection import train_test_split from sklearn import datasets X_train, X_test, y_train, y_test = train_test_split(tmpXtrain, np.log1p(Y1train), test_size=0.1) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgbm_params = { 'objective': 'regression', 'metric': 'rmse', } best_params, tuning_history = dict(), list() booster_casual = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, best_params=best_params, tuning_history=tuning_history) print(""Best Params:"", best_params) print(""Tuning history:"", tuning_history) '",Yes,3,7.0 "X_train, X_test, y_train, y_test = train_test_split(tmpXtrain, np.log1p(Y2train), test_size=0.1) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgbm_params = { 'objective': 'regression', 'metric': 'rmse', } best_params, tuning_history = dict(), list() booster_registered = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, best_params=best_params, tuning_history=tuning_history) print(""Best Params:"", best_params) print(""Tuning history:"", tuning_history)'",Yes,3,13.0 "pred_casual = booster_casual.predict(tmpXtest, num_iteration=booster_casual.best_iteration) pred_casual = np.expm1(pred_casual) pred_registered = booster_registered.predict(tmpXtest, num_iteration=booster_registered.best_iteration) pred_registered = np.expm1(pred_registered) pred = pred_casual + pred_registered pred[pred<0] = 0 submission = pd.DataFrame({'datetime': test.datetime, 'count': pred}, 
columns=['datetime', 'count']) submission.to_csv(""submission.csv"", index=False)'",Yes,4,25.0 "b""# \nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nfrom matplotlib import pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom pathlib import Path\nfrom IPython.display import display\nfrom datetime import datetime\nfrom pandas import DataFrame\nfrom typing import List, NamedTuple, Tuple\n\n# allow plots to appear directly in the notebook\n%matplotlib \n\n# Supress Warnings\nimport warnings\nwarnings.filterwarnings('ignore')\n\npd.set_option('display.max_rows', 100)\npd.set_option('display.max_columns', 50)\npd.set_option('display.width', 1000)""",Yes,5,23.0 "# def load(path: Path) -> DataFrame: # return pd.read_csv(path) return pd.read_csv(path, parse_dates=True, index_col=""datetime"")'",No,5,45.0 "ROOT_DIR = Path(""/kaggle/input/bike-sharing-demand"") TRAIN_DATA_PATH = ROOT_DIR / ""train.csv"" TEST_DATA_PATH = ROOT_DIR / ""test.csv"" TARGET_NAME = 'count' original_train: DataFrame = load(TRAIN_DATA_PATH) original_test: DataFrame = load(TEST_DATA_PATH)'",No,4,45.0 original_train.head(),No,5,41.0 original_test.head(),No,5,41.0 "only_train_columns = [c1 for c1 in original_train.columns if not c1 in original_test.columns and c1 != TARGET_NAME] print('Only Train Columns') print(only_train_columns) only_test_columns = [c1 for c1 in original_test.columns if not c1 in original_train.columns] print('Only Test Columns') print(only_test_columns)",Yes,3,13.0 original_train.isnull().sum(),No,5,39.0 original_test.isnull().sum(),No,5,39.0 original_train.dtypes,No,5,70.0 original_test.dtypes,No,5,70.0 "feature_columns = [c1 for c1 in original_train.columns if c1 in original_test.columns and c1 != TARGET_NAME] feature_columns",No,4,14.0 sns.distplot(original_train[TARGET_NAME]),No,5,33.0 "def draw_distplot(df, name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.distplot(df[name]) def draw_distplots(df, columns): M = round(len(columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(columns): draw_distplot(df=df, name=name, fig=fig, m=M, n=N, idx=idx+1) draw_distplots(df=original_train, columns=feature_columns)",No,5,33.0 sns.pairplot(original_train[feature_columns]),No,3,33.0 "sns.heatmap(original_train[feature_columns].corr(), annot=True)",No,5,80.0 "def draw_boxplot(df, x_name, y_name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.boxplot(data=df, x=x_name, y=y_name) def draw_boxplots(df, x_columns, y_name): M = round(len(x_columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(x_columns): draw_boxplot(df=df, x_name=name,y_name=y_name, fig=fig, m=M, n=N, idx=idx+1) draw_boxplots(df=original_train, x_columns=categorical_feature_columns, y_name=TARGET_NAME)",No,5,33.0 "def draw_scatterplot(df, x_name, y_name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.scatterplot(data=df, x=x_name, y=y_name) def draw_scatterplots(df, x_columns, y_name): M = round(len(x_columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(x_columns): draw_scatterplot(df=df, x_name=name,y_name=y_name, fig=fig, m=M, n=N, idx=idx+1) draw_scatterplots(df=original_train, x_columns=numeric_feature_columns, y_name=TARGET_NAME)",No,5,33.0 "train_data = original_train.copy() test_data = original_test.copy()",No,5,12.0 "#categorical columns change to one-hot encoding data def replaced_with_onehot_cols(data: 
DataFrame, col_names: List[str]) -> DataFrame: data = data.copy() for col_name in col_names: one_hot = pd.get_dummies(data[col_name], prefix=col_name) data = data.join(one_hot) # Original column is not needed anymore del data[col_name] return data",No,5,20.0 "train_data = replaced_with_onehot_cols(data=train_data, col_names=categorical_feature_columns) test_data = replaced_with_onehot_cols(data=test_data, col_names=categorical_feature_columns)",No,3,20.0 "#remove only_train_columns train_data = train_data.drop(only_train_columns, axis=1) train_data.head()",Yes,4,10.0 "#seperate datetime index def expanded_index_datetime_col(data: DataFrame) -> DataFrame: data = data.copy() data[""hour""] = data.index.hour data[""weekday""] = data.index.weekday data[""month""] = data.index.month data[""year""] = data.index.year return data",No,5,8.0 "train_data = expanded_index_datetime_col(data=train_data) test_data = expanded_index_datetime_col(data=test_data) train_data.head()",Yes,3,16.0 "#change datetime data to one-hot data datetime_cols = ['hour', 'weekday','month','year'] train_data = replaced_with_onehot_cols(data=train_data, col_names=datetime_cols) test_data = replaced_with_onehot_cols(data=test_data, col_names=datetime_cols) train_data.head()",Yes,4,20.0 "from sklearn.preprocessing import MinMaxScaler def normalize_cols(df: DataFrame, scaler) -> DataFrame: df = df.copy() return DataFrame(scaler.fit_transform(df.values), columns=df.columns, index=df.index) x_scaler = MinMaxScaler() x = train_data.drop(TARGET_NAME, axis=1) x = normalize_cols(df=x, scaler=x_scaler) y_scaler = MinMaxScaler() y = train_data[[TARGET_NAME]] y = normalize_cols(df=y, scaler=y_scaler) test_data = normalize_cols(df=test_data, scaler=x_scaler)",Yes,5,21.0 x.head(),No,5,41.0 y.head(),No,5,41.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor() result = model.fit(x.values, y.values)",Yes,5,7.0 "features = pd.DataFrame() features[""features""] = x.columns features[""coefficient""] = model.feature_importances_ features.sort_values(by=[""coefficient""], ascending=False, inplace=True) fig,ax= plt.subplots() fig.set_size_inches(20,20) sns.barplot(data=features, x=""coefficient"", y=""features"");",No,5,79.0 "!pip install livelossplot tensorflow-gpu import tensorflow.keras.backend as K import tensorflow as tf",Yes,5,87.0 "#Split train, test data from sklearn.model_selection import train_test_split random_seed = 5 x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=random_seed) print('train data count : ' + str(len(x_train))) print('test data count : ' + str(len(x_validation)))",Yes,5,13.0 "#make cost function from sklearn import metrics def rmsle_K(y, pred): return K.sqrt(K.mean(K.square(tf.math.log1p(y) - tf.math.log1p(pred)))) def rmsle(y, pred): return np.sqrt(metrics.mean_squared_error(y, pred))",Yes,4,49.0 "#Make DL Models from tensorflow.keras.layers import Input, Dense, Dropout from tensorflow.keras.models import Model from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau _, NUM_FEATURES = x_train.shape def make_dl_model()-> Model: input = Input(shape=(NUM_FEATURES, )) _ = Dense(32, activation='relu')(input) _ = Dropout(0.4)(_) _ = Dense(32, activation='relu')(_) _ = Dropout(0.4)(_) _ = Dense(16, activation='relu')(_) output = Dense(1, activation='relu')(_) model = Model(inputs=input, outputs=output) model.compile(optimizer='adam', loss=rmsle_K, metrics=['mse']) return model ",Yes,4,4.0 "model = make_dl_model() 
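# The cells below append to `results` and later read `sorted_result`, but the cell that
# defines them is not among those shown. A minimal sketch consistent with the NamedTuple
# import used in this notebook; the field names follow the keyword arguments used below,
# while the NamedTuple form and the ascending sort by cost are assumptions.
from typing import List, NamedTuple

class ModelResult(NamedTuple):
    name: str
    cost: float
    model: object

results: List[ModelResult] = []
# e.g. once every model has been scored:
# sorted_result = sorted(results, key=lambda r: r.cost)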
model.fit(x_train,y_train, validation_data=(x_validation,y_validation), epochs=200, batch_size=128, verbose=1, callbacks=[ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=5, min_lr=0.000001, verbose=1), EarlyStopping(monitor=""val_loss"", patience=10, verbose=0), ] ) results.append(ModelResult(name='DL',cost=rmsle(y_validation, model.predict(x_validation)), model=model))'",Yes,4,7.0 "#Make ML Models from sklearn.linear_model import LinearRegression from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge from sklearn.linear_model import ElasticNet from sklearn.ensemble import RandomForestRegressor from xgboost import XGBRegressor ml_models = { 'LinearRegression': LinearRegression(), 'LassoRegression': Lasso(), 'RidgeRegression': Ridge(), 'ElasticNet': ElasticNet(), 'RandomForestRegressor': RandomForestRegressor(), 'XGBRegressor': XGBRegressor() } #train ML Models for name, model in ml_models.items(): model.fit(x_train, y_train) results.append(ModelResult(name=name,cost=rmsle(y_validation, model.predict(x_validation)), model=model))",Yes,5,3.0 "best_model = sorted_result[0].model y_pred = best_model.predict(test_data) y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1)).astype(int)",Yes,4,3.0 "#Save Submission submission = test_data.copy() submission[""datetime""] = test_data.index submission[""count""] = y_pred.astype(int) submission = submission[[""datetime"", ""count""]] submission.to_csv('submission.csv', index=False) '",Yes,5,25.0 "#Import libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",Yes,5,23.0 "from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = ""all"" ",Yes,5,23.0 "sns.set(style=""dark"") sns.set(style=""whitegrid"", color_codes=True)",No,5,23.0 "train=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') print('train shape:',train.shape) print('test shape:',test.shape)",Yes,4,45.0 "#check for null data train.isnull().sum()",No,5,39.0 "import missingno as msno fig,ax=plt.subplots(2,1,figsize=(10,5)) msno.matrix(train,ax=ax[0]) ax[0].set_title('Train Data') msno.matrix(test,ax=ax[1]) ax[1].set_title('Test Data')",Yes,5,34.0 "#variable datatype: train.info()",No,5,40.0 "from datetime import datetime from dateutil import parser import calendar #parse string datetime into datetime format train['datetime2']=train.datetime.apply(lambda x: parser.parse(x)) #Get some different time variables train['year']=train.datetime2.apply(lambda x: x.year) train['month']=train.datetime2.apply(lambda x: x.month) train['weekday']=train.datetime2.apply(lambda x: x.weekday()) train['weekday_name']=train.datetime2.apply(lambda x: calendar.day_name[x.weekday()]) train['hour']=train.datetime2.apply(lambda x: x.hour) ",Yes,5,8.0 "#create categorical data train['season_decode']=train.season.map({1:'spring',2:'summer',3:'fall',4:'winter'}) train['working_decode']=train.workingday.map({1:'work',0:'notwork'}) train['weather_decode']=train.weather.map({1:'Clear',2:'Mist',3:'LightRain',4:'HeavyRain'})",No,5,20.0 "f,ax=plt.subplots(1,2) sns.distplot(train['count'],bins=30,ax=ax[0]) ax[0].set_title('count distrib') sns.boxplot(data=train,y=train['count'],ax=ax[1]) ax[1].set_title('count boxplot')",No,5,33.0 "mean_count=train['count'].mean() std_count=train['count'].std() print(mean_count-3*std_count) print(mean_count+3*std_count) outliers1=train[train['count']>(mean_count+3*std_count)] 
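# For comparison only (not the author's method): with right-skewed counts an IQR fence
# is another common outlier cut-off; this snippet just reports it alongside the
# mean + 3*std rule used above.
q1, q3 = train['count'].quantile([0.25, 0.75])
iqr_upper = q3 + 1.5 * (q3 - q1)
print('IQR upper fence:', iqr_upper, '| rows above it:', (train['count'] > iqr_upper).sum())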
len(outliers1['count'])",No,3,40.0 "train2=train[train['count']<=(mean_count+3*std_count)] train2.shape",Yes,4,14.0 "#Season sns.boxplot(data=train2,y=train2['count'],x=train['season_decode']).set_title('Demand by season')",No,5,33.0 "#Year train2.groupby(['year','month'])['count'].mean().plot().set_title('demand by year') ",No,5,75.0 "#WeekDay & Hour: week_hour=train2.groupby(['weekday_name','hour'])['count'].mean().unstack() week_hour=week_hour.reindex(index=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']) plt.figure(figsize=(15,6)) cmap2 = sns.cubehelix_palette(start=2,light=1, as_cmap=True) sns.heatmap(week_hour,cmap=cmap2).set_title('Demand by Day-Hour')",Yes,5,80.0 "#Difference between casual and resgitered train2.groupby(['hour'])['casual','registered','count'].mean().plot().set_title('Demand by hour') ",No,5,33.0 " train2.groupby(['weekday_name'])['casual','registered','count'].mean().plot(kind='bar').set_title('demand by day of week') ",No,5,33.0 "#Weather train2.groupby(['weather_decode'])['casual','registered'].mean().plot(kind='bar').set_title('demand by weather')",No,5,33.0 "#Temp season_temp=train2.groupby(['season_decode','temp'])['count'].mean().unstack() plt.figure(figsize=(15,8)) cmap3 = sns.cubehelix_palette(start=6,light=1, as_cmap=True) sns.heatmap(season_temp,cmap=cmap3).set_title('demand by season and temperature')",Yes,5,80.0 "Correlation_Matrix=train2[['holiday','workingday','weather','temp','atemp','humidity','windspeed','casual','registered','count']].corr() mask = np.array(Correlation_Matrix) mask[np.tril_indices_from(mask)] = False fig,ax= plt.subplots() fig.set_size_inches(20,10) sns.heatmap(Correlation_Matrix,mask=mask,vmax=.8,annot=True,square=True)",No,5,80.0 "#preparing data sets for random forest X=train2[['season','holiday','workingday','weather','temp','atemp','humidity','windspeed','year','month','weekday','hour']] y_count=train2['count'] y_casual=train2['casual'] y_reg=train2['registered']",No,5,21.0 "from sklearn.preprocessing import StandardScaler #Scaled all distributions X_Scaled=StandardScaler().fit_transform(X=X)",Yes,5,18.0 "from sklearn.model_selection import train_test_split #Split for train-test X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_count, test_size=0.25, random_state=42) ",Yes,5,13.0 "from sklearn.ensemble import RandomForestRegressor rf_count=RandomForestRegressor() rf_count.fit(X_train,y_train) importance_count=pd.DataFrame(rf_count.feature_importances_ , index=X.columns, columns=['count']).sort_values(by='count',ascending=False) ",Yes,4,7.0 "importance_count.plot(kind='bar',color='r').set_title('Importance of features for total demand')",No,5,79.0 "#repeat for casual demand: X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_casual, test_size=0.25, random_state=42) rf_casual=RandomForestRegressor() rf_casual.fit(X_train,y_train) importance_casual=pd.DataFrame(rf_casual.feature_importances_ , index=X.columns, columns=['casual']).sort_values(by='casual',ascending=False) ",Yes,4,7.0 importance_casual.plot(kind='bar').set_title('Importance of features for casual demand'),No,5,79.0 "#repeat for registered demand: X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_reg, test_size=0.25, random_state=42) rf_reg=RandomForestRegressor() rf_reg.fit(X_train,y_train) importance_reg=pd.DataFrame(rf_reg.feature_importances_ , index=X.columns, columns=['reg']).sort_values(by='reg',ascending=False) ",Yes,4,7.0 "importance_reg.plot(kind='bar',color='g').set_title('Importance of 
features for registered demand')",No,5,79.0 "importance_df=pd.concat([importance_count,importance_casual,importance_reg],axis=1) importance_df.plot(kind='bar').set_title('Feature importance for each kind of demand')",No,4,79.0 "#Prepare Training data X_train=train2[feature_selection] print(X_train.shape) y_train=train2['count'] print(y_train.shape)",No,4,14.0 "#Prepare Test data #parse string datetime into datetime format test['datetime2']=test.datetime.apply(lambda x: parser.parse(x)) #Get some different time variables test['year']=test.datetime2.apply(lambda x: x.year) test['month']=test.datetime2.apply(lambda x: x.month) test['weekday']=test.datetime2.apply(lambda x: x.weekday()) test['hour']=test.datetime2.apply(lambda x: x.hour) X_test=test[feature_selection] print(X_test.shape)",Yes,4,8.0 "X_train_scaled=StandardScaler().fit_transform(X=X_train) X_test_scaled=StandardScaler().fit_transform(X=X_test)",No,5,18.0 "from sklearn.metrics import mean_squared_log_error from sklearn.metrics import make_scorer def rmsle(y,y_pred): return np.sqrt(mean_squared_log_error(y,y_pred)) rmsle_score=make_scorer(rmsle)",Yes,4,49.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import cross_val_score rfr=RandomForestRegressor(random_state=42) score=cross_val_score(rfr,X_train_scaled,y_train,cv=15,scoring=rmsle_score) print(f'Score rmsle mean: {np.round(score.mean(),4)}') print(f'Score rmsle std: {np.round(score.std(),4)}')",Yes,5,28.0 "rfr.fit(X_train_scaled,y_train) y_pred=rfr.predict(X_test_scaled)",Yes,4,7.0 "submission=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission['count']=y_pred submission.to_csv('submissionI.csv',index=False)",Yes,4,25.0 "#Without Scaling Data rfr.fit(X_train,y_train) y_pred=rfr.predict(X_test) submission2=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission2['count']=y_pred submission2.to_csv('submissionII.csv',index=False) ",Yes,4,7.0 "from sklearn.model_selection import GridSearchCV, train_test_split x_train2,x_test2,y_train2,y_test2=train_test_split(X_train,y_train,test_size=0.25,random_state=42) params={'n_estimators': [10,50,100,300,500], 'n_jobs':[-1], 'max_features':['auto','sqrt','log2'], 'random_state':[42]} rfr_tuned=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,scoring='neg_mean_squared_log_error',verbose=True) rfr_tuned.fit(x_train2,y_train2) print(rfr_tuned.best_params_) print(rfr_tuned.best_estimator_) ",Yes,4,6.0 "from sklearn.ensemble import RandomForestRegressor rfr_final=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False) rfr_final.fit(x_train2,y_train2) y_pred2=rfr_final.predict(x_test2) print('RMSLE:',np.round(rmsle(y_test2,y_pred2),4))",Yes,3,7.0 "rfr_final=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False) rfr_final.fit(X_train,y_train) y_pred=rfr.predict(X_test) submission3=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission3['count']=y_pred 
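# Note: the prediction above is taken from `rfr` (the untuned model) even though
# `rfr_final` was just refit on the full training set; if the tuned model was intended,
# the fitted grid-search winner can also be reused directly instead of re-typing its
# parameters. A sketch of that alternative, left as comments so the submission above
# is unchanged:
# best_rf = rfr_tuned.best_estimator_   # refit on x_train2/y_train2 by GridSearchCV
# best_rf.fit(X_train, y_train)         # refit on the full training data
# submission3['count'] = best_rf.predict(X_test)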
submission3.to_csv('submissionIII.csv',index=False)",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib from sklearn.ensemble import RandomForestRegressor # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"",parse_dates=[""datetime""]) train.head()",Yes,4,45.0 "test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"",parse_dates=[""datetime""]) test.head()",Yes,4,45.0 "train.info() test.info()",No,5,40.0 "train[""year""] = train[""datetime""].dt.year train[""month""] = train[""datetime""].dt.month train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek train.shape",Yes,5,8.0 "test[""year""] = test[""datetime""].dt.year test[""month""] = test[""datetime""].dt.month test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek test.shape",Yes,4,8.0 "for var in categorical_feature: train[var] = train[var].astype(""category"") test[var] = test[var].astype(""category"") train.info()",Yes,5,16.0 "X_train = train[feature] X_test = test[feature] X_train.head()",Yes,3,21.0 "Y_train = train[""count""] Y_train.head()",Yes,3,41.0 "model = RandomForestRegressor(n_estimators=500) Y_train_log = np.log1p(Y_train) model.fit(X_train,Y_train_log) result = model.predict(X_test)",Yes,4,7.0 "sub = pd.read_csv(""/kaggle/input/bike-sharing-demand/sampleSubmission.csv"") sub.head()",Yes,4,45.0 "sub[""count""] = np.exp(result) sub.head()",Yes,5,55.0 "sub.to_csv(""20_03_29sub.csv"",index=False)",No,5,25.0 "import calendar import seaborn as sb import xgboost as xgb import plotly.express as px import pandas_profiling as pp import matplotlib.pyplot as plt from plotly.subplots import make_subplots from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, Ridge from sklearn.metrics import mean_squared_log_error,make_scorer from sklearn.model_selection import train_test_split,GridSearchCV",No,5,22.0 "#Reading the file df_train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"")",No,5,45.0 pp.ProfileReport(df_train),No,5,40.0 df_train.isnull().sum(axis=0),No,5,39.0 "corr = df_train[['temp','atemp','humidity', 'windspeed','casual', 'registered', 'count']].corr() f,axes = plt.subplots(1,1,figsize = (8,8)) sb.heatmap(corr,square=True,annot = True,linewidth = .5,center = 1.4,ax = axes)",No,5,80.0 "y = ['casual','registered','count'] list_continuous = ['temp','atemp','humidity','windspeed'] n=3 s= 15 f,axes = plt.subplots(4,3,figsize = (s,s)) counter = 0 for i in list_continuous: for j in y: sb.lineplot(x = i , y = j , data = df_train, ax = axes[counter//n][counter%n]) counter+=1",No,4,33.0 "df_train['Date'] = pd.DatetimeIndex(df_train['datetime']).date df_train['Hour'] = 
pd.DatetimeIndex(df_train['datetime']).hour df_train['Day'] = pd.DatetimeIndex(df_train['datetime']).day df_train['Month'] = pd.DatetimeIndex(df_train['datetime']).month df_train['Year'] = pd.DatetimeIndex(df_train['datetime']).year df_train['Weekday'] = pd.DatetimeIndex(df_train['datetime']).weekday_name",No,5,8.0 "a = [] for i in df_train.index: a.append('Total Count : '+str(df_train['count'][i])) df_train['count_vis'] = a",No,2,12.0 "fig = px.line(x = 'Date', y = ""count"", data_frame = df_train,color = 'Hour', range_y = (0,1150),hover_data = ['Hour','Date','casual','registered'], title = 'Interactive LinePlot of the whole dataset(Hover for more details)', hover_name = 'count_vis', text = None,height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'season' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'holiday' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'workingday' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'weather' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Hour' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,75.0 df_train.groupby('Weekday').count().index,No,3,60.0 "df_train_temp = df_train.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']} dic1 = {'registered':'Average count of registered poeple commuting.', 'count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""registered"", data_frame = df_train_temp.reset_index(), color = 'Weekday',hover_data = ['count'],category_orders = dic, title = 'Interactive LinePlot of the registered separated by weekday(Hover for more details)', labels = dic1,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "df_train_temp = df_train.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']} dic1 = {'casual':'Average count of casual poeple commuting.', 'count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""casual"", data_frame = 
df_train_temp.reset_index(), color = 'Weekday',hover_data = ['count'],category_orders = dic, title = 'Interactive LinePlot of the casual separated by weekday(Hover for more details)', labels = dic1,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Day' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Month' #order = ['January','February','March','April','May','June','July','August','September','October','November','December'] plot = sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Year' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 df_train.describe(),No,5,40.0 "for i in df_train.groupby('season').count().index: s = 's'+str(i) a=[] for j in df_train.season: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby('weather').count().index: s = 'w'+str(i) a=[] for j in df_train.weather: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby('Hour').count().index: s = 'Hour'+str(i) a=[] for j in df_train.Hour: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby(""Month"").count().index: s = 'Month' + str(i) a = [] for j in df_train.Month: if j==i: a.append(1) else: a.append(0) df_train[s] = a df_train.sample(5)'",Yes,3,12.0 "df_train = df_train[['Hour0', 'Hour1', 'Hour2', 'Hour3', 'Hour4', 'Hour5', 'Hour6', 'Hour7', 'Hour8', 'Hour9', 'Hour10', 'Hour11', 'Hour12', 'Hour13', 'Hour14', 'Hour15', 'Hour16', 'Hour17', 'Hour18', 'Hour19', 'Hour20', 'Hour21', 'Hour22', 'Hour23','Month1', 'Month2', 'Month3', 'Month4', 'Month5', 'Month6', 'Month7', 'Month8', 'Month9', 'Month10', 'Month11', 'Month12','Year','s1','s2','s3','s4','holiday','workingday', 'w1','w2','w3','w4','temp','humidity','casual','registered']]",No,5,10.0 "df_train_x = df_train.drop('casual',axis = 1).drop('registered',axis=1) df_train_x.describe()",Yes,4,10.0 "df_reg_train_y = df_train['registered'] df_reg_train_y.describe",Yes,5,40.0 "df_cas_train_y = df_train['casual'] df_cas_train_y.describe",Yes,4,40.0 "x1_train, x1_test, y1_train, y1_test = train_test_split(df_train_x, df_reg_train_y, test_size=0.15, random_state=42) x2_train, x2_test, y2_train, y2_test = train_test_split(df_train_x, df_cas_train_y, test_size=0.15, random_state=42)",No,5,13.0 "poly = PolynomialFeatures(degree=2) poly_x1_train = poly.fit_transform(x1_train) poly_x1_test = poly.fit_transform(x1_test) poly_x2_train = poly.fit_transform(x2_train) poly_x2_test = poly.fit_transform(x2_test)",No,5,8.0 "rf = RandomForestRegressor() xg = xgb.XGBRegressor() parameter = {""max_depth"": [1,2,3,4,5,6], ""eta"": [0.01,0.03,0.05], ""alpha"":[0],'n_estimators': 
[100,500,800,1000,1200,1400]} parameters = {'n_estimators':[50,100,150,200,250], 'min_impurity_decrease':[0.0,0.001,0.01], 'max_depth':[20,40,60,80,100]} models = ['Normal Linear Regression: ','Linear Regression over polynomial: ', 'Random Forest Regressor: ','XG Boosting: ']'",Yes,4,5.0 "def custom_scorer(y_true,y_pred): for i in range(len(y_pred)): if y_pred[i]<0: y_pred[i] = 1 return np.sqrt(mean_squared_log_error(y_true, y_pred )) scorer = make_scorer(custom_scorer,greater_is_better = False)",No,5,84.0 "predict = [] reg = LinearRegression().fit(x1_train, y1_train) pre_reg = reg.predict(x1_test) reg_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg_poly = reg_poly.predict(poly_x1_test) rf_reg = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_reg.fit(x1_train, y1_train) pre_rf_reg = rf_reg.predict(x1_test) xg_reg = GridSearchCV(xg,parameter,cv=5,verbose = 2 , scoring = scorer, n_jobs = -1) xg_reg.fit(x1_train, y1_train) pre_xg_reg = xg_reg.predict(x1_test) predict.append(pre_reg) predict.append(pre_reg_poly) predict.append(pre_rf_reg) predict.append(pre_xg_reg)",Yes,3,7.0 "for prediction in range(len(predict)): pre = [] for p in predict[prediction]: if p < 1: pre.append(1) else: pre.append(p) print(models[prediction]+str(np.sqrt(mean_squared_log_error(y1_test, pre ))))",No,3,49.0 "predict = [] cas = LinearRegression().fit(x2_train, y2_train) pre_cas = cas.predict(x2_test) cas_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_cas_poly = cas_poly.predict(poly_x2_test) rf_cas = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_cas.fit(x2_train, y2_train) pre_rf_cas = rf_cas.predict(x2_test) xg_cas = GridSearchCV(xg,parameter,cv=5,verbose = 2 , scoring = scorer, n_jobs = -1) xg_cas.fit(x2_train, y2_train) pre_xg_cas = xg_cas.predict(x2_test) predict.append(pre_cas) predict.append(pre_cas_poly) predict.append(pre_rf_cas) predict.append(pre_xg_cas)",Yes,3,7.0 "for prediction in range(len(predict)): pre = [] for p in predict[prediction]: if p < 1: pre.append(1) else: pre.append(p) print(models[prediction]+str(np.sqrt(mean_squared_log_error(y2_test, pre ))))",No,3,49.0 "print(""For Random Forest Model: "") print(""\\t Best Parametres for registered are: "",end='') print(rf_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(rf_cas.best_params_) print(""\ For XGBoost Model: "") print(""\\t Best Parametres for registered are: "",end='') print(xg_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(xg_cas.best_params_)'",No,2,2.0 "predict1 = [] reg1 = LinearRegression().fit(x1_train, y1_train) pre_reg1 = reg1.predict(x1_test) reg1_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg1_poly = reg1_poly.predict(poly_x1_test) rf1 = RandomForestRegressor(n_estimators = 250,min_impurity_decrease = 0.001, max_depth=60).fit(x1_train, y1_train) pre_rf1 = rf1.predict(x1_test) xg1 = xgb.XGBRegressor(alpha = 0, eta = 0.03, n_estimators = 1200, max_depth = 6).fit(x1_train,y1_train) pre_xg1 = xg1.predict(x1_test) for i in range(pre_reg1.size): if pre_reg1[i]<1: pre_reg1[i] = 1 if pre_reg1_poly[i]<1: pre_reg1_poly[i] = 1 if pre_rf1[i]<1: pre_rf1[i] = 1 if pre_xg1[i]<1: pre_xg1[i] = 1 predict1.append(pre_reg1) predict1.append(pre_reg1_poly) predict1.append(pre_rf1) predict1.append(pre_xg1) x1_final = x1_test.copy() x1_final['Output'] = y1_test x1_final['Linear'] = pre_reg1 x1_final['Lin_poly'] = pre_reg1_poly x1_final['RF'] = pre_rf1 x1_final['XG'] = pre_xg1 
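# The per-element clipping loops above can be written as single vectorized calls; the
# lines below are no-ops at this point (the loop already floored the arrays at 1) and
# are included only as the equivalent idiomatic form.
pre_rf1 = np.maximum(pre_rf1, 1)
pre_xg1 = np.maximum(pre_xg1, 1)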
x1_final['Resid'] = y1_test-pre_reg1 x1_final['Resid_poly'] = y1_test-pre_reg1_poly x1_final['Resid_rf'] = y1_test - pre_rf1 x1_final['Resid_xg'] = y1_test - pre_xg1 for prediction in range(len(predict1)): print(models[prediction]+ str(np.sqrt(mean_squared_log_error(y1_test,predict1[prediction] ))))",Yes,2,7.0 "predict2 = [] reg2 = LinearRegression().fit(x2_train, y2_train) pre_reg2 = reg2.predict(x2_test) reg2_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_reg2_poly = reg2_poly.predict(poly_x2_test) rf2 = RandomForestRegressor(n_estimators = 100,min_impurity_decrease = 0.001, max_depth=40).fit(x2_train, y2_train) pre_rf2 = rf2.predict(x2_test) xg2 = xgb.XGBRegressor(alpha = 0, eta = 0.05, n_estimators = 800, max_depth = 6).fit(x2_train,y2_train) pre_xg2 = xg2.predict(x2_test) for i in range(pre_reg2.size): if pre_reg2[i]<1: pre_reg2[i] = 1 if pre_reg2_poly[i]<1: pre_reg2_poly[i] = 1 if pre_rf2[i]<1: pre_rf2[i] = 1 if pre_xg2[i]<1: pre_xg2[i] = 1 predict2.append(pre_reg2) predict2.append(pre_reg2_poly) predict2.append(pre_rf2) predict2.append(pre_xg2) x2_final = x2_test.copy() x2_final['Output'] = y2_test x2_final['Linear'] = pre_reg2 x2_final['Lin_poly'] = pre_reg2_poly x2_final['RF'] = pre_rf2 x2_final['XG'] = pre_xg2 x2_final['Resid'] = y2_test-pre_reg2 x2_final['Resid_poly'] = y2_test-pre_reg2_poly x2_final['Resid_rf'] = y2_test - pre_rf2 x2_final['Resid_xg'] = y2_test - pre_xg2 for prediction in range(len(predict2)): print(models[prediction]+ str(np.sqrt(mean_squared_log_error(y2_test, predict2[prediction]))))",No,2,7.0 "name1 = ['Residual for casual without polynomial features'] *1633 name2 = ['Residual for casual with polynomial features'] *1633 name3 = ['Residual for registered without polynomial features'] *1633 name4 = ['Residual for registered with polynomial features'] *1633 dic = {'Lin': 'Output Predicted using linear model', 'Lin_poly': 'Output Predicted using polynomial features', 'RF' : 'Output Predicted using RandomForest Model', 'XG': 'Output Predicted using XGBoost Model', 'Resid':'Deviation from predicted','Output':'Expected Output', 'Resid_poly':'Deviation from predicted','Resid_rf':'Deviation from predicted', 'Output':'Expected Output','Resid_xg':'Deviation from predicted'} fig1 = px.scatter(data_frame = x1_final,x = 'Linear', y = 'Resid',hover_data = ['Output'], labels = dic,hover_name = name3,color_discrete_sequence = ['red']) fig2 = px.scatter(data_frame = x1_final,x = 'Lin_poly', y = 'Resid_poly', hover_data = ['Output'],labels = dic,hover_name = name4, color_discrete_sequence = ['blue']) fig3 = px.scatter(data_frame = x2_final,x = 'Linear', y = 'Resid',hover_data = ['Output'], labels = dic,hover_name = name1,color_discrete_sequence = ['darkgreen']) fig4 = px.scatter(data_frame = x2_final,x = 'Lin_poly', y = 'Resid_poly', hover_data = ['Output'],labels = dic,hover_name = name2, color_discrete_sequence = ['gold']) trace1 = fig1['data'][0] trace2 = fig2['data'][0] trace3 = fig3['data'][0] trace4 = fig4['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Linear Model','Polynomial Model'], column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace3, row=1, col=1) fig.add_trace(trace4, row=2, col=1) fig.add_trace(trace1, row=1, col=2) fig.add_trace(trace2, row=2, col=2) fig.show()",No,5,56.0 "name5 = ['Residual for casual using RandomForest Model'] *1633 name6 = ['Residual for casual using XGBoost 
Model'] *1633 name7 = ['Residual for registered using RandomForest Model'] *1633 name8 = ['Residual for registered using XGBoost Model'] *1633 dic = {'Lin': 'Output Predicted using linear model', 'Lin_poly': 'Output Predicted using polynomial features', 'RF' : 'Output Predicted using RandomForest Model', 'XG': 'Output Predicted using XGBoost Model', 'Resid':'Deviation from predicted','Output':'Expected Output', 'Resid_poly':'Deviation from predicted','Resid_rf':'Deviation from predicted', 'Output':'Expected Output','Resid_xg':'Deviation from predicted'} fig5 = px.scatter(data_frame = x1_final,x = 'RF', y = 'Resid_rf',hover_data = ['Output'], labels = dic,hover_name = name7,color_discrete_sequence = ['red']) fig6 = px.scatter(data_frame = x1_final,x = 'XG', y = 'Resid_xg',hover_data = ['Output'], labels = dic,hover_name = name8,color_discrete_sequence = ['blue']) fig7 = px.scatter(data_frame = x2_final,x = 'RF', y = 'Resid_rf',hover_data = ['Output'], labels = dic,hover_name = name5,color_discrete_sequence = ['darkgreen']) fig8 = px.scatter(data_frame = x2_final,x = 'XG', y = 'Resid_xg',hover_data = ['Output'], labels = dic,hover_name = name6,color_discrete_sequence = ['gold']) trace5 = fig5['data'][0] trace6 = fig6['data'][0] trace7 = fig7['data'][0] trace8 = fig8['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Random Forest','XGBoost'], column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace5, row=1, col=2) fig.add_trace(trace6, row=2, col=2) fig.add_trace(trace7, row=1, col=1) fig.add_trace(trace8, row=2, col=1) fig.show()",No,5,56.0 "rf1 = RandomForestRegressor(n_estimators = 200,min_impurity_decrease = 0.001, max_depth=80).fit(df_train_x,df_reg_train_y) xg2 = xgb.XGBRegressor(alpha = 0, eta = 0.05, max_depth = 6, n_estimators = 800).fit(df_train_x,df_cas_train_y)",No,5,7.0 df_test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv'),No,5,45.0 "test=df_test test.describe()",No,5,40.0 "test['mth'] = pd.DatetimeIndex(test['datetime']).month test['Year'] = pd.DatetimeIndex(test['datetime']).year test['dy'] = pd.DatetimeIndex(test['datetime']).day test['hr'] = pd.DatetimeIndex(test['datetime']).hour for i in test.groupby(""season"").count().index: s = 's' + str(i) a = [] for j in test.season: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby(""weather"").count().index: s = 'w' + str(i) a = [] for j in test.weather: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby('hr').count().index: s = 'Hour'+str(i) a=[] for j in test.hr: if j==i: a.append(1) else: a.append(0) test[s]=a for i in test.groupby(""mth"").count().index: s = 'Month' + str(i) a = [] for j in test.mth: if j==i: a.append(1) else: a.append(0) test[s] = a test.sample(10)'",Yes,5,8.0 "test = test[['Hour0','Hour1','Hour2','Hour3','Hour4','Hour5','Hour6','Hour7','Hour8', 'Hour9','Hour10','Hour11','Hour12','Hour13','Hour14','Hour15','Hour16', 'Hour17','Hour18','Hour19','Hour20','Hour21','Hour22','Hour23','Month1', 'Month2','Month3','Month4','Month5','Month6','Month7','Month8','Month9', 'Month10','Month11','Month12','Year','s1','s2','s3','s4','holiday', 'workingday','w1','w2', 'w3','w4','temp','humidity']] test.describe",Yes,5,40.0 "pre_reg = rf1.predict(test) pre_cas = xg2.predict(test) final_predictions = pd.DataFrame(pre_cas+pre_reg,columns = ['cout']) final_predictions.describe",Yes,4,48.0 "s=[] for j in 
final_predictions.cout: if int(j)<1: s.append(1) else: s.append(j) final_predictions['count'] = s ",No,2,78.0 final_predictions.describe,No,5,40.0 "final_predictions['datetime']=df_test['datetime'] final_predictions = final_predictions[['datetime','count']]",No,5,55.0 final_predictions.describe(),No,5,40.0 "final_predictions.to_csv('submission.csv',index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings(""ignore"", category=FutureWarning) warnings.filterwarnings(""ignore"")",No,5,23.0 "train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"")",No,5,45.0 target = train['count'],No,3,12.0 "from scipy import stats from scipy.stats import norm",No,5,22.0 "sns.distplot(train['count'],fit=norm)",No,5,33.0 "train[""log_count""] = np.log(target+1)",No,5,8.0 "sns.distplot(train[""log_count""], fit=norm)",No,5,33.0 "feature_names=list(test) df_train=train[feature_names] df=pd.concat((df_train, test))",No,4,11.0 "print(train.shape, test.shape, df.shape)",No,5,58.0 import datetime,No,5,22.0 tmp = pd.to_datetime(train['datetime']),No,5,16.0 "df['datetime'] = pd.to_datetime(df['datetime']) df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofweek'] = df['datetime'].dt.dayofweek df['month'] = df['datetime'].dt.month df['year'] = df['datetime'].dt.year df['weekend'] = (df['dayofweek'] ==5) | (df['dayofweek'] == 6)",No,5,8.0 "train['datetime'] = pd.to_datetime(train['datetime']) train['day'] = train['datetime'].dt.day train['hour'] = train['datetime'].dt.hour train['dayofweek'] = train['datetime'].dt.dayofweek train['month'] = train['datetime'].dt.month train['year'] = train['datetime'].dt.year train['weekend'] = (train['dayofweek'] ==5) | (train['dayofweek'] == 6)",No,4,8.0 "df.drop(['datetime'], axis=1, inplace=True)",No,5,10.0 "figure, axs = plt.subplots(3,2, figsize = (15,10)) sns.barplot(data=train, x = ""day"", y = target, ax = axs[0][0]) sns.barplot(data=train, x = ""hour"", y = target, ax = axs[0][1]) sns.barplot(data=train, x = ""dayofweek"", y = target, ax = axs[1][0]) sns.barplot(data=train, x = ""weekend"", y = target, ax = axs[1][1]) sns.barplot(data=train, x = ""month"", y = target, ax = axs[2][0]) sns.barplot(data=train, x = ""year"", y = target, ax = axs[2][1])",No,5,75.0 "df=df.drop(columns=['month', 'day'])",No,5,10.0 df,No,5,41.0 "sns.barplot(data=df[:len(train)], x='season', y=target)",No,5,33.0 "season_encoded = pd.get_dummies(df['season'],prefix= 'season') df = pd.concat((df,season_encoded), axis=1) df = df.drop(columns = 'season')",Yes,4,20.0 "sns.barplot(data=df[:len(train)], x='holiday', y=target)",No,5,33.0 "sns.barplot(data=df[:len(train)], x='workingday', y=target)",No,5,33.0 "sns.barplot(data=df[:len(train)], x='weather', y=target) df['weather'] = df['weather']",No,5,33.0 "weather_encoded = pd.get_dummies(df['weather'],prefix= 'weather') df = pd.concat((df,weather_encoded), axis=1) df = df.drop(columns = 'weather')",Yes,4,20.0 "fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows = 5) fig.set_size_inches(20,30) sns.pointplot(data = train, x = ""hour"", y = ""count"", ax = ax1) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""season"", ax = ax2) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""holiday"", ax = ax3) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""workingday"", ax = ax4) sns.pointplot(data = 
train, x = ""hour"", y = ""count"", hue = ""weather"", ax = ax5)",No,5,75.0 "from scipy.stats import skew skew = df.apply(lambda x: skew(x)) skew.sort_values(ascending = False)",Yes,4,40.0 "skew = skew[abs(skew) > 0.5] skew",No,5,14.0 "cor = train.iloc[:,1:-1].corr() cor.head()",No,5,40.0 "mask = np.array(cor) mask[np.tril_indices_from(mask)] = False fig,ax= plt.subplots() fig.set_size_inches(20,10) sns.heatmap(cor,mask= mask,square=True,annot=True)",No,5,80.0 from statsmodels.stats.outliers_influence import variance_inflation_factor,No,5,22.0 "vif_data = df.iloc[:,:6] vif_data.info()",Yes,4,40.0 "vif = pd.DataFrame() vif['Features'] = vif_data.columns vif['vif'] = [variance_inflation_factor( vif_data.values, i) for i in range(vif_data.shape[1])] vif.sort_values(by='vif',ascending=False)",Yes,4,12.0 "sns.distplot(df['pca'], fit=norm)",No,5,33.0 "fig, [ax1,ax2,ax3] = plt.subplots(1,3) fig.set_size_inches(12,5) sns.regplot(train['temp'], 'count', data = train, ax=ax1) sns.regplot(train['humidity'], 'count', data = train, ax=ax2) sns.regplot(train['windspeed'], 'count', data = train, ax=ax3)",No,5,33.0 "stats.pearsonr(train['temp'],target)",No,4,47.0 "sns.countplot(data = df, x = ""windspeed"")",No,5,33.0 "df = df.drop(columns=['temp','atemp'])",No,5,10.0 "fig, axes = plt.subplots(nrows=3, ncols=2, figsize = (15,20)) sns.boxplot(data = train, y=""count"", x = ""holiday"", orient = ""v"", ax = axes[0][0]) sns.boxplot(data = train, y=""count"", x = ""workingday"", orient = ""v"", ax = axes[0][1]) sns.boxplot(data = train, y=""count"", x = ""hour"", orient = ""v"", ax = axes[1][0]) sns.boxplot(data = train, y=""count"", x = ""dayofweek"", orient = ""v"", ax = axes[1][1]) sns.boxplot(data = train, y=""count"", x = ""year"", orient = ""v"", ax = axes[2][0])",No,5,33.0 "from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV from sklearn.metrics import mean_squared_log_error from sklearn.preprocessing import RobustScaler from sklearn.pipeline import make_pipeline from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor from sklearn.linear_model import ElasticNet, Lasso, LinearRegression",No,5,22.0 "new_train = df[:train.shape[0]] new_test = df[train.shape[0]:]",No,5,13.0 target = train['log_count'],No,5,21.0 "X_train, X_val, y_train, y_val = train_test_split(new_train, target, test_size=0.2, shuffle=True)",No,5,13.0 "def rmsle_score(preds, true): rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5 return rmsle_score",No,5,84.0 "from sklearn.metrics.scorer import make_scorer RMSLE = make_scorer(rmsle_score)",Yes,5,84.0 import statsmodels.api as sm,No,5,22.0 "lasso = make_pipeline(GridSearchCV(Lasso(random_state=1),param, cv=10, scoring = RMSLE))",No,5,53.0 "lasso.fit(X_train,y_train)",No,5,7.0 la_yhat = lasso.predict(X_val),No,5,48.0 "s_lasso = rmsle_score(la_yhat,y_val) s_lasso",No,5,49.0 pred_la = lasso.predict(new_test),No,5,48.0 "param_e = {'alpha' :[0.1,1.0,10], 'max_iter' :[1000000], 'l1_ratio':[0.04,0.05], 'normalize':[True,False]}",No,5,5.0 "Enet.fit(X_train,y_train)",No,5,7.0 Enet_yhat = Enet.predict(X_val),No,5,48.0 "s_Enet = rmsle_score(Enet_yhat,y_val) s_Enet",No,5,49.0 pred_Enet = Enet.predict(new_test),No,5,48.0 "param_Rf = {'min_samples_split' : [3,4,6,10], 'n_estimators' : [70,100], 'random_state': [5] }",No,5,5.0 "RF = make_pipeline(GridSearchCV(RandomForestRegressor(random_state=1),param_Rf, cv=10, scoring = RMSLE))",No,5,82.0 
"RF.fit(X_train,y_train)",No,5,7.0 "RF_yhat = RF.predict(X_val) s_RF = rmsle_score(RF_yhat,y_val) s_RF",No,3,48.0 pred_RF = RF.predict(new_test),No,5,48.0 "param_GB = [{'learning_rate': [1,0.1,0.01,0.001], 'n_estimators': [50, 100, 200, 500, 1000]}]",No,5,5.0 "GB = make_pipeline(GridSearchCV(GradientBoostingRegressor(random_state=1),param_GB, cv=10, scoring = RMSLE))",No,5,4.0 "GB.fit(X_train,y_train)",No,5,7.0 "GB_yhat = GB.predict(X_val) s_GB = rmsle_score(GB_yhat,y_val) s_GB",Yes,3,48.0 pred_GB = GB.predict(new_test),No,5,48.0 "param_lgb = param_grid = [{ 'n_estimators': [400, 700, 1000], 'max_depth': [15,20,25], 'num_leaves': [50, 100, 200], 'min_split_gain': [0.3, 0.4], }]",No,5,5.0 "lgb = make_pipeline(GridSearchCV(LGBMRegressor(verbose_eval=False,random_state=1),param_lgb, cv=10, scoring = RMSLE))",No,3,4.0 "lgb.fit(X_train,y_train)",No,5,7.0 "lgb_yhat = lgb.predict(X_val) s_lgb = rmsle_score(lgb_yhat,y_val) s_lgb",Yes,4,27.0 pred_lgb = lgb.predict(new_test),No,5,48.0 "sns.barplot(x=list_regressors, y=list_scores) plt.ylabel('RMSE')",No,5,33.0 "df_predictions = pd.DataFrame(data=predictions) df_predictions.corr()",Yes,5,40.0 "plt.figure(figsize=(7, 7)) sns.heatmap(df_predictions.corr(),linewidths=1.5, annot=True, square=True, yticklabels=df_predictions.columns , xticklabels=df_predictions.columns) ",No,5,80.0 "RF.fit(new_train,target)",No,5,7.0 "log_pred=RF.predict(new_test) predictions=np.exp(log_pred)-1",No,5,48.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = predictions sub.head()",No,5,55.0 "sub.to_csv('submission.csv', index=False)",No,5,25.0 "lgb.fit(new_train,target)",No,5,7.0 "log_pred_lgb=lgb.predict(new_test) predictions_lgb=np.exp(log_pred_lgb)-1",No,5,48.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = predictions_lgb sub.head()",No,3,41.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = ensemble sub.head()",No,5,55.0 "import numpy as np from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.tree import DecisionTreeRegressor #from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.neighbors import KNeighborsRegressor import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from lightgbm import LGBMClassifier from sklearn.linear_model import Ridge from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV import seaborn as sns import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "filepath= '/kaggle/input/bike-sharing-demand/train.csv' filepath1= '/kaggle/input/bike-sharing-demand/test.csv' testdata= pd.read_csv(filepath1) testdata2=testdata traindata= pd.read_csv(filepath) traindata.head()",Yes,4,45.0 "#We remove the id column in both the training and testing datasets. 
traindata=traindata.drop('datetime',axis=1) testdata=testdata.drop('datetime',axis=1) #We also remove the casual and registered columns because they are not present in the test dataset traindata=traindata.drop('casual',axis=1) traindata=traindata.drop('registered',axis=1)",No,5,10.0 "b""#Checking the correlation of each column with the count\ncorr = numeric_features.corr()\nprint (corr['count'].sort_values(ascending=False), '\\n')\nprint (corr['count'].sort_values(ascending=False))\n""",No,4,40.0 "#We see how various features compare with the count column_names=['temp','atemp','windspeed'] for i in column_names: plt.scatter(x=traindata[i], y=traindata['count']) plt.ylabel('count') plt.xlabel(i) plt.show()",No,5,33.0 "#Let us visualize the other columns and see how they relate with counts col=['season','holiday','workingday','weather'] for i in col: sns.factorplot(x=i,y=""count"",data=traindata,kind='bar',size=5,aspect=1.5)",No,5,33.0 "traindata.temp.unique() fig,axes=plt.subplots(2,2) axes[0,0].hist(x=""temp"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,0].set_title(""Variation of temp"") axes[0,1].hist(x=""atemp"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,1].set_title(""Variation of atemp"") axes[1,0].hist(x=""windspeed"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,0].set_title(""Variation of windspeed"") axes[1,1].hist(x=""humidity"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,1].set_title(""Variation of humidity"") fig.set_size_inches(10,10)",No,5,33.0 "#Now we will visualise the remaining features and compare them with the number of rentals column_names=['season','holiday','workingday','weather'] for i in column_names: feature = traindata.pivot_table(index=i, values='count') feature.plot(kind='bar', color='blue') plt.xlabel(i) plt.ylabel('counts') plt.xticks(rotation=0) plt.show()",No,5,33.0 "#Split the data into train and test y=traindata['count'] x=traindata.drop('count',axis=1) x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",Yes,5,13.0 " #Linear Regression linearRegressor = LinearRegression() linearRegressor.fit(x_train, y_train) y_predicted = linearRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,3,7.0 "#for random forest regression. 
(tuning) no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x_train,y_train) pred=clf_rf.predict(x_test) mse = mean_squared_error(y_test, pred) r = r2_score(y_test, pred) mae = mean_absolute_error(y_test,pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,3,6.0 "# for KNN (tuning) n_neighbors=[] for i in range (0,50,5): if(i!=0): n_neighbors.append(i) params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]} clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_knn.fit(x_train,y_train) pred=clf_knn.predict(x_test) mse = mean_squared_error(y_test, pred) r = r2_score(y_test, pred) mae = mean_absolute_error(y_test,pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,3,6.0 "# Thus we can use RandomForest Regresson. no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x,y) Prediction=clf_rf.predict(testdata)'",Yes,4,6.0 "predictionlist=Prediction.tolist() counts=testdata2['datetime'].tolist() output=pd.DataFrame(list(zip(counts, predictionlist)), columns=['datetime','count']) output.head() output.to_csv('my_submission(ikeSharingDemand).csv', index=False)",Yes,5,25.0 "# import warnings warnings.filterwarnings('ignore') import numpy as np import pandas as pd import seaborn as sns # import matplotlib.pyplot as plt import calendar from datetime import datetime import os print(os.listdir(""../input""))'",Yes,4,23.0 "b""# \n\ntrain = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')\ntest = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')\n\ntrain.head()""",Yes,4,45.0 """"""" datetime - hourly date + timestamp season - 1 = spring, 2 = summer, 3 = fall, 4 = winter holiday - whether the day is considered a holiday workingday - whether the day is neither a weekend nor holiday weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog temp - temperature in Celsius atemp - ""feels like"" temperature in Celsius humidity - relative humidity windspeed - wind speed casual - number of non-registered user rentals initiated registered - number of registered user rentals initiated count - number of total rentals """""" train.info()'",No,5,40.0 "# test.head()'",No,5,41.0 "b""# \n# split - - \ntrain['tempDate'] = train.datetime.apply(lambda x:x.split())""",No,4,13.0 "# tempDate--yearmonthdayweekday train['year'] = train.tempDate.apply(lambda x:x[0].split('-')[0]) train['month'] = train.tempDate.apply(lambda x:x[0].split('-')[1]) train['day'] = train.tempDate.apply(lambda x:x[0].split('-')[2]) #weekdaycalendardatetime train['weekday'] = train.tempDate.apply(lambda x:calendar.day_name[datetime.strptime(x[0],""%Y-%m-%d"").weekday()]) train['hour'] = train.tempDate.apply(lambda x:x[1].split(':')[0])'",No,4,13.0 "b""# \n\ntrain['year'] = pd.to_numeric(train.year,errors='coerce')\ntrain['month'] = pd.to_numeric(train.month,errors='coerce')\ntrain['day'] = 
pd.to_numeric(train.day,errors='coerce')\ntrain['hour'] = pd.to_numeric(train.hour,errors='coerce')""",No,5,16.0 "# train.info()'",No,5,40.0 "b""# tempDate\n\ntrain = train.drop('tempDate',axis=1)""",No,5,10.0 "b""# count\n\n#year - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.barplot(x='year',y='count',data=train.groupby('year')['count'].mean().reset_index())\n\n#month - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.barplot(x='month',y='count',data=train.groupby('month')['count'].mean().reset_index())\n\n#day - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.barplot(x='day',y='count',data=train.groupby('day')['count'].mean().reset_index())\n\n#hour - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.barplot(x='hour',y='count',data=train.groupby('hour')['count'].mean().reset_index())""",No,5,75.0 "#season - count fig = plt.figure(figsize=[12,10]) ax1 = fig.add_subplot(2,2,1) ax1 = sns.barplot(x='season',y='count',data=train.groupby('season')['count'].mean().reset_index()) #holiday - count ax2 = fig.add_subplot(2,2,2) ax2 = sns.barplot(x='holiday',y='count',data=train.groupby('holiday')['count'].mean().reset_index()) #workingday - count ax3 = fig.add_subplot(2,2,3) ax3 = sns.barplot(x='workingday',y='count',data=train.groupby('workingday')['count'].mean().reset_index()) #weather - count ax4 = fig.add_subplot(2,2,4) ax4 = sns.barplot(x='weather',y='count',data=train.groupby('weather')['count'].mean().reset_index())",No,5,33.0 "def badToRight(month): if month in [12,1,2]: return 4 elif month in [3,4,5]: return 1 elif month in [6,7,8]: return 2 elif month in [9,10,11]: return 3 train['season'] = train.month.apply(badToRight)",No,5,8.0 "b""# 1\n\n#season - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.barplot(x='season',y='count',data=train.groupby('season')['count'].mean().reset_index())\n\n#holiday - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.barplot(x='holiday',y='count',data=train.groupby('holiday')['count'].mean().reset_index())\n\n#woikingday - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.barplot(x='workingday',y='count',data=train.groupby('workingday')['count'].mean().reset_index())\n\n#weather - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.barplot(x='weather',y='count',data=train.groupby('weather')['count'].mean().reset_index())""",No,5,33.0 "# heatmap fig = plt.figure(figsize=[20,20]) ax = sns.heatmap(train.corr(),annot=True,square=True)'",No,5,80.0 "b""# heatmapcount\n\n#hour season - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.pointplot(x='hour',y='count',hue='season',data=train.groupby(['season','hour'])['count'].mean().reset_index())\n\n#hour holiday - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.pointplot(x='hour',y='count',hue='holiday',data=train.groupby(['holiday','hour'])['count'].mean().reset_index())\n\n#hour weekday - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.pointplot(x='hour',y='count',hue='weekday',hue_order=['Sunday','Monday','Tuesday','Wendnesday','Thursday','Friday','Saturday'],data=train.groupby(['weekday','hour'])['count'].mean().reset_index())\n\n#hour weather - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.pointplot(x='hour',y='count',hue='weather',data=train.groupby(['weather','hour'])['count'].mean().reset_index())""",No,5,75.0 "# train[train.weather==4]'",No,5,14.0 "#month, weather - count fig = plt.figure(figsize=[12,10]) ax1 = fig.add_subplot(2,1,1) ax1 = 
sns.pointplot(x='month',y='count',hue='weather',data=train.groupby(['weather','month'])['count'].mean().reset_index()) #month - count ax2 = fig.add_subplot(2,1,2) ax2 = sns.barplot(x='month',y='count',data=train.groupby('month')['count'].mean().reset_index())",No,5,75.0 """"""" Windspeed: a large number of rows have windspeed 0, which looks like missing data rather than a true reading. Either drop those rows or predict the windspeed for the rows where it is 0 and fill it in. """""" # convert weekday to a categorical type and check its categories train['weekday']= train.weekday.astype('category') print(train['weekday'].cat.categories)",Yes,4,16.0 "b""# predict the missing (0) windspeed values with a random forest\n\nfrom sklearn.ensemble import RandomForestRegressor\n\n# rows where windspeed is 0\nwindspeed_0 = train[train.windspeed == 0]\n# rows where windspeed is not 0\nwindspeed_Not0 = train[train.windspeed != 0]\n\n# features for the rows where windspeed is 0\nwindspeed_0_df = windspeed_0.drop(['windspeed','casual','registered','count','datetime'],axis=1)\n\n# features and target for the rows where windspeed is not 0\nwindspeed_Not0_df = windspeed_Not0.drop(['windspeed','casual','registered','count','datetime'],axis=1)\nwindspeed_Not0_series = windspeed_Not0['windspeed'] \n\n# fit on the rows with a non-zero windspeed\nrf = RandomForestRegressor()\nrf.fit(windspeed_Not0_df,windspeed_Not0_series)\n\n# predict windspeed for the rows where it was 0\npredicted_windspeed_0 = rf.predict(windspeed_0_df)\n\n# fill in the predicted values\nwindspeed_0['windspeed'] = predicted_windspeed_0""",Yes,3,7.0 "# put the filled rows back together train = pd.concat([windspeed_0,windspeed_Not0],axis=0)",No,5,11.0 "b""# convert datetime from string type to datetime\ntrain.datetime = pd.to_datetime(train.datetime,errors='coerce')""",No,5,16.0 "b""# sort by datetime\ntrain = train.sort_values(by=['datetime'])""",No,5,9.0 "# correlation heatmap after filling windspeed fig = plt.figure(figsize=[20,20]) ax = sns.heatmap(train.corr(),annot=True,square=True)",No,5,80.0 "fig = plt.figure(figsize=[5,5]) sns.distplot(train['windspeed'],bins=np.linspace(train['windspeed'].min(),train['windspeed'].max(),10)) plt.suptitle(""Filled by Random Forest Regressor"") print(""Min value of windspeed is {}"".format(train['windspeed'].min()))",No,5,33.0 "b""# reload the train and test data\n\ntrain = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')\ntest = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')""",No,5,45.0 "combine = pd.concat([train,test],axis=0) combine.info()",Yes,4,11.0 "combine['tempDate'] = combine.datetime.apply(lambda x:x.split()) combine['weekday'] = combine.tempDate.apply(lambda x: calendar.day_name[datetime.strptime(x[0],""%Y-%m-%d"").weekday()]) combine['year'] = combine.tempDate.apply(lambda x: x[0].split('-')[0]) combine['month'] = combine.tempDate.apply(lambda x: x[0].split('-')[1]) combine['day'] = combine.tempDate.apply(lambda x: x[0].split('-')[2]) combine['hour'] = combine.tempDate.apply(lambda x: x[1].split(':')[0])",No,4,13.0 "combine['year'] = pd.to_numeric(combine.year,errors='coerce') combine['month'] = pd.to_numeric(combine.month,errors='coerce') combine['day'] = pd.to_numeric(combine.day,errors='coerce') combine['hour'] = pd.to_numeric(combine.hour,errors='coerce')",No,5,16.0 combine.info(),No,5,40.0 combine['season'] = combine.month.apply(badToRight),No,5,8.0 "combine.weekday = combine.weekday.astype('category') combine.weekday.cat.categories = ['5','1','6','0','4','2','3'] dataWind0 = combine[combine['windspeed']==0] dataWindNot0 = combine[combine['windspeed']!=0] dataWind0.columns",Yes,3,14.0 "dataWind0_df = dataWind0.drop(['windspeed','casual','registered','count','datetime','tempDate'],axis=1) dataWindNot0_df = dataWindNot0.drop(['windspeed','casual','registered','count','datetime','tempDate'],axis=1) dataWindNot0_series = dataWindNot0['windspeed'] dataWindNot0_df.head()",Yes,4,10.0 "rf2 = RandomForestRegressor() rf2.fit(dataWindNot0_df,dataWindNot0_series) predicted = rf2.predict(dataWind0_df) print(predicted)",Yes,4,7.0 "dataWind0['windspeed'] = predicted combine = 
pd.concat([dataWind0,dataWindNot0],axis=0)",No,5,11.0 "b""#\nfor col in categorizational_columns:\n combine[col] = combine[col].astype('category')""",No,5,16.0 "b""# countdatetime\ntrain = combine[pd.notnull(combine['count'])].sort_values(by='datetime')\ntest = combine[~pd.notnull(combine['count'])].sort_values(by='datetime')\n\n# \ndatetimecol = test['datetime']\nyLabels = train['count'] #count\nyLabelsRegistered = train['registered'] #\nyLabelsCasual = train['casual'] #""",Yes,3,21.0 "# columntraintest train = train.drop(drop_columns,axis=1) test = test.drop(drop_columns,axis=1)'",No,5,10.0 """"""" RMSLE RMSLE https://programmers.co.kr/learn/courses/21/lessons/943# RMSLE 0 """""" # y is predict value y_ is actual value def rmsle(y, y_,convertExp=True): if convertExp: y = np.exp(y), y_ = np.exp(y_) log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))'",No,5,84.0 "# # attr from sklearn.linear_model import LinearRegression,Ridge,Lasso lr = LinearRegression() """""" yLabelsnp.lognp.log1p np.log1pnp.log1+ xx0log - np.log1p """""" yLabelslog = np.log1p(yLabels) # lr.fit(train,yLabelslog) # preds = lr.predict(train) #rmsleelementnp.exppredsloglog print('RMSLE Value For Linear Regression: {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))'",Yes,3,7.0 """"""" GridSearchCV GridSearchCV https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html https://datascienceschool.net/view-notebook/ff4b5d491cc34f94aea04baca86fbef8/ """""" from sklearn.model_selection import GridSearchCV from sklearn import metrics #RidgeL2alpha ridge = Ridge() #Ridge ridge_params = {'max_iter':[3000],'alpha':[0.001,0.01,0.1,1,10,100,1000]} rmsle_scorer = metrics.make_scorer(rmsle,greater_is_better=False) grid_ridge = GridSearchCV(ridge,ridge_params,scoring=rmsle_scorer,cv=5) grid_ridge.fit(train,yLabelslog) preds = grid_ridge.predict(train) print(grid_ridge.best_params_) print('RMSLE Value for Ridge Regression {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))'",Yes,3,6.0 "#GridSearchCVgrid_ridgecv_result_alpha df = pd.DataFrame(grid_ridge.cv_results_)'",No,5,12.0 "b""#RidgeL1alpha\nlasso = Lasso()\n\nlasso_params = {'max_iter':[3000],'alpha':[0.001,0.01,0.1,1,10,100,1000]}\ngrid_lasso = GridSearchCV(lasso,lasso_params,scoring=rmsle_scorer,cv=5)\ngrid_lasso.fit(train,yLabelslog)\npreds = grid_lasso.predict(train)\nprint('RMSLE Value for Lasso Regression {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))""",Yes,3,6.0 "rf = RandomForestRegressor() rf_params = {'n_estimators':[1,10,100]} grid_rf = GridSearchCV(rf,rf_params,scoring=rmsle_scorer,cv=5) grid_rf.fit(train,yLabelslog) preds = grid_rf.predict(train) print('RMSLE Value for RandomForest {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))",Yes,3,6.0 "from sklearn.ensemble import GradientBoostingRegressor gb = GradientBoostingRegressor() gb_params={'max_depth':range(1,11,1),'n_estimators':[1,10,100]} grid_gb=GridSearchCV(gb,gb_params,scoring=rmsle_scorer,cv=5) grid_gb.fit(train,yLabelslog) preds = grid_gb.predict(train) print('RMSLE Value for GradientBoosting {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))",Yes,4,6.0 "predsTest = grid_gb.predict(test) fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sns.distplot(yLabels,ax=ax1,bins=50) sns.distplot(np.exp(predsTest),ax=ax2,bins=50)",No,5,56.0 "submission = pd.DataFrame({ ""datetime"": datetimecol, ""count"": [max(0, 
x) for x in np.exp(predsTest)] }) submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)'",Yes,5,25.0 "import calendar import seaborn as sb import xgboost as xgb import plotly.express as px import matplotlib.pyplot as plt from plotly.subplots import make_subplots from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, Ridge from sklearn.metrics import mean_squared_log_error,make_scorer from sklearn.model_selection import train_test_split,GridSearchCV",No,5,22.0 "#Reading the file file = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"")",No,5,45.0 file.describe(),No,5,40.0 file.isnull().sum(axis=0),No,5,39.0 file.columns,No,5,71.0 "corr = file[['temp','atemp','humidity', 'windspeed','casual', 'registered','count']].corr() f,axes = plt.subplots(1,1,figsize = (8,8)) sb.heatmap(corr,square=True,annot = True,linewidth = .5,center = 1.4,ax = axes)",No,5,80.0 "file = file file['Date'] = pd.DatetimeIndex(file['datetime']).date file['Hour'] = pd.DatetimeIndex(file['datetime']).hour file['Day'] = pd.DatetimeIndex(file['datetime']).day file['Month'] = pd.DatetimeIndex(file['datetime']).month file['Year'] = pd.DatetimeIndex(file['datetime']).year file['Weekday'] = pd.DatetimeIndex(file['datetime']).weekday_name",No,5,8.0 "fig = px.line(x = 'Date', y = ""count"", data_frame = file,color = 'Hour',range_y = (0,1150), title = 'Interactive LinePlot of the whole dataset(Hover for more details)', hover_data = ['Hour','Date','casual','registered'], hover_name = 'count_vis', text = None, height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'season' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'holiday' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'workingday' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'weather' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Hour' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "file1 = file.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']} dic1 = {'registered':'Average 
count of registered poeple commuting.','count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""registered"", data_frame = file1.reset_index(),color = 'Weekday', title = 'Interactive LinePlot of the registered separated by weekday(Hover for more details)',labels = dic1, hover_data = ['count'],category_orders = dic,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Day' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Month' #order = ['January','February','March','April','May','June','July','August','September','October','November','December'] plot = sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Year' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "for i in file.groupby('season').count().index: s = 's'+str(i) a=[] for j in file.season: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",No,5,53.0 "for i in file.groupby('weather').count().index: s = 'w'+str(i) a=[] for j in file.weather: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",Yes,5,53.0 "for i in file.groupby('Hour').count().index: s = 'Hour'+str(i) a=[] for j in file.Hour: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",Yes,5,8.0 "for i in file.groupby(""Month"").count().index: s = 'Month' + str(i) a = [] for j in file.Month: if j==i: a.append(1) else: a.append(0) file[s] = a file.sample(5)'",Yes,5,53.0 feed.describe(),No,5,40.0 feed.columns,No,5,71.0 "df_train_x = feed.drop('casual',axis = 1).drop('registered',axis=1) df_train_x.describe()",Yes,4,10.0 "df_reg_train_y = feed['registered'] df_reg_train_y.describe",Yes,5,40.0 "df_cas_train_y = feed['casual'] df_cas_train_y.describe",Yes,4,40.0 "x1_train, x1_test, y1_train, y1_test = train_test_split(df_train_x, df_cas_train_y, test_size=0.15, random_state=42) x2_train, x2_test, y2_train, y2_test = train_test_split(df_train_x, df_reg_train_y, test_size=0.15, random_state=42)",No,5,13.0 "rf = RandomForestRegressor() parameters = {'n_estimators':[50,100,150,200,250], 'min_impurity_decrease':[0.0,0.001,0.01], 'max_depth':[20,40,60,80,100]} models = ['Normal Linear Regression: ','Linear Regression over polynomial: ', 'Decision Tree Regressor: ','XG Boosting: ']",Yes,3,5.0 "predict = [] reg = LinearRegression().fit(x1_train, y1_train) pre_reg = reg.predict(x1_test) reg_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg_poly = reg_poly.predict(poly_x1_test) rf_reg = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_reg.fit(x1_train, y1_train) pre_rf_reg = rf_reg.predict(x1_test) predict.append(pre_reg) predict.append(pre_reg_poly) predict.append(pre_rf_reg)",Yes,3,7.0 "predict = [] cas = LinearRegression().fit(x2_train, 
y2_train) pre_cas = cas.predict(x2_test) cas_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_cas_poly = cas_poly.predict(poly_x2_test) rf_cas = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_cas.fit(x2_train, y2_train) pre_rf_cas = rf_cas.predict(x2_test) predict.append(pre_cas) predict.append(pre_cas_poly) predict.append(pre_rf_cas)",Yes,3,7.0 "print(""For Random Forest Model: "") print(""\\t Best Parametres for registered are: "",end='') print(rf_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(rf_cas.best_params_)'",No,2,2.0 "predict1 = [] reg1 = LinearRegression().fit(x1_train, y1_train) pre_reg1 = reg1.predict(x1_test) reg1_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg1_poly = reg1_poly.predict(poly_x1_test) rf1 = RandomForestRegressor(n_estimators = 200,max_depth=80,min_impurity_decrease = 0.001).fit(x1_train, y1_train) pre_rf1 = rf1.predict(x1_test) for i in range(pre_reg1.size): if pre_reg1[i]<1: pre_reg1[i] = 1 if pre_reg1_poly[i]<1: pre_reg1_poly[i] = 1 if pre_rf1[i]<1: pre_rf1[i] = 1 predict1.append(pre_reg1) predict1.append(pre_reg1_poly) predict1.append(pre_rf1) x1_final = x1_test.copy() x1_final['Output'] = y1_test x1_final['Lin_reg'] = pre_reg1 x1_final['Lin_reg_poly'] = pre_reg1_poly x1_final['RF_reg'] = pre_rf1 x1_final['Resid'] = y1_test-pre_reg1 x1_final['Resid_poly'] = y1_test-pre_reg1_poly for prediction in predict1: print(np.sqrt(mean_squared_log_error( y1_test, prediction )))",Yes,2,7.0 "predict2 = [] reg2 = LinearRegression().fit(x2_train, y2_train) pre_reg2 = reg2.predict(x2_test) reg2_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_reg2_poly = reg2_poly.predict(poly_x2_test) rf2 = RandomForestRegressor(n_estimators = 150,max_depth=60,min_impurity_decrease = 0.0).fit(x2_train, y2_train) pre_rf2 = rf2.predict(x2_test) for i in range(pre_reg2.size): if pre_reg2[i]<1: pre_reg2[i] = 1 if pre_reg2_poly[i]<1: pre_reg2_poly[i] = 1 if pre_rf2[i]<1: pre_rf2[i] = 1 predict2.append(pre_reg2) predict2.append(pre_reg2_poly) predict2.append(pre_rf2) x2_final = x2_test.copy() x2_final['Output'] = y2_test x2_final['Lin_reg'] = pre_reg2 x2_final['Lin_reg_poly'] = pre_reg2_poly x2_final['RF_reg'] = pre_rf2 x2_final['Resid'] = y2_test-pre_reg2 x2_final['Resid_poly'] = y2_test-pre_reg2_poly for prediction in predict2: print(np.sqrt(mean_squared_log_error( y2_test, prediction )))",Yes,3,7.0 "from plotly.subplots import make_subplots name1 = ['Residual for casual without polynomial features'] *1633 name2 = ['Residual for casual with polynomial features'] *1633 name3 = ['Residual for registered without polynomial features'] *1633 name4 = ['Residual for registered with polynomial features'] *1633 dic = {'Lin_reg': 'Predicted Output','Resid':'Deviation from predicted','Output':'Expected Output','Lin_reg_poly': 'Predicted Output', 'Resid_poly':'Deviation from predicted'} fig1 = px.scatter(data_frame = x1_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name1, color_discrete_sequence = ['red']) fig2 = px.scatter(data_frame = x1_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name2, color_discrete_sequence = ['blue']) fig3 = px.scatter(data_frame = x2_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name3, color_discrete_sequence = ['darkgreen']) fig4 = px.scatter(data_frame = x2_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name4, 
color_discrete_sequence = ['gold']) trace1 = fig1['data'][0] trace2 = fig2['data'][0] trace3 = fig3['data'][0] trace4 = fig4['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Using Polynomial','Without Polynomial'],column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace1, row=1, col=1) fig.add_trace(trace2, row=1, col=2) fig.add_trace(trace3, row=2, col=1) fig.add_trace(trace4, row=2, col=2) fig.show()",No,5,56.0 "rf1 = RandomForestRegressor(n_estimators = 200,max_depth=80,min_impurity_decrease = 0.001).fit(df_train_x,df_cas_train_y) rf2 = RandomForestRegressor(n_estimators = 150,max_depth=60,min_impurity_decrease = 0.0).fit(df_train_x,df_reg_train_y)",No,5,7.0 test_file = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv'),No,5,45.0 "test=test_file test.describe()",No,5,40.0 "test['mth'] = pd.DatetimeIndex(test['datetime']).month test['yr'] = pd.DatetimeIndex(test['datetime']).year test['dy'] = pd.DatetimeIndex(test['datetime']).day test['hr'] = pd.DatetimeIndex(test['datetime']).hour for i in test.groupby(""season"").count().index: s = 's' + str(i) a = [] for j in test.season: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby(""weather"").count().index: s = 'w' + str(i) a = [] for j in test.weather: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby('hr').count().index: s = 'hr'+str(i) a=[] for j in test.hr: if j==i: a.append(1) else: a.append(0) test[s]=a for i in test.groupby(""mth"").count().index: s = 'm' + str(i) a = [] for j in test.mth: if j==i: a.append(1) else: a.append(0) test[s] = a test.sample(10)'",No,5,8.0 "pre_cas = rf1.predict(test) pre_reg = rf2.predict(test) final_predictions = pd.DataFrame(pre_cas+pre_reg,columns = ['cout']) final_predictions.describe",Yes,4,48.0 "final_predictions['datetime']=test_file['datetime'] final_predictions = final_predictions[['datetime','count']]",No,5,55.0 "#Importing libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.stats import chi2_contingency from scipy.stats import spearmanr %matplotlib inline import itertools import os import calendar from datetime import datetime from scipy import stats from scipy.special import inv_boxcox from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split as split for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) ",Yes,5,88.0 "def change_to_categorical(data, max_cat=10): df = data.copy() for col in df.columns: if df[col].dtype == object or str(df[col].dtype).startswith(('int','float')): count = len(df[col].unique()) if count <= max_cat: df[col] = df[col].astype('category') return df",No,4,16.0 "def get_count_plot(x, df, ax, y=None, value_counts = None, print_percent = False): if value_counts is None: counts = df[x].value_counts().sort_index() else: counts = df[value_counts] #counts.plot.bar() #sns bars are just more colorful :P if y is None: sns.countplot(x, data=df, ax=ax) else: sns.barplot(x, y, data=df, ax=ax) if print_percent: print_percent_count_plot(counts, ax)",No,4,33.0 "def get_count_plot_for_categorical(df, n_cols = 2, y='cnt', list_cat=None, value_counts=None, print_percent=False): if list_cat is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: cat_col = list_cat f, axs, n_rows = 
get_fig_and_axis_for_subplots(len(cat_col), n_cols) for i, col in enumerate(cat_col): ax = plt.subplot(n_rows, n_cols, i+1) get_count_plot(col, df, ax, y, value_counts, print_percent)",No,5,33.0 "def get_target_dist_with_categorical(df, n_cols = 2, y='cnt', list_cat=None, plot_type = 'box'): if list_cat is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: cat_col = list_cat f, axs, n_rows = get_fig_and_axis_for_subplots(len(cat_col), n_cols) for i, col in enumerate(cat_col): ax = plt.subplot(n_rows, n_cols, i+1) if plot_type == 'box': sns.boxplot(x=col, data=df,y=y,orient=""v"",ax=ax) else: sns.violinplot(col, data=df,y=y,orient=""v"",ax=ax)'",No,5,33.0 "def get_plot_for_numerical(df, n_cols = 2, plot_type='probability',list_col=None, hist=True, kde=True): if list_col is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: num_col = list_col f, axs, n_rows = get_fig_and_axis_for_subplots(len(num_col), n_cols) for i, col in enumerate(num_col): ax = plt.subplot(n_rows, n_cols, i+1) if plot_type == 'probability': sns.distplot(df[col], hist=hist, kde=hist, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}) elif plot_type == 'box': sns.boxplot(data=df,y=col,orient=""v"",ax=ax) else: sns.violinplot(data=df,y=col,orient=""v"",ax=ax)'",No,5,33.0 "# visualize correlation matrix def visualize_corr_matrix(data): numerical_col, cat_col = get_numerical_and_categorical_col(data) df = data[numerical_col] corr = df.corr()# plot the heatmap #generating masks for upper triangle so that values are not repeated mask_ut=np.triu(np.ones(corr.shape)).astype(np.bool) sns.heatmap(corr, mask=mask_ut, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))",No,5,80.0 "def remove_outliers_for_variable_by_quantiles(data, col, q1=.25, q2=.75): df = data.copy() median = df[col].median() q25, q75 = df[col].quantile([q1,q2]) iqr = q75-q25 upper_wh = q75 +1.5*iqr lower_wh = q25 - 1.5*iqr whiskers = int(np.floor(lower_wh)), int(np.ceil(upper_wh)) df.drop(df[~df[col].between(whiskers[0], whiskers[1]) & (~np.isnan(df[col]))].index, inplace=True) return df",No,5,8.0 "def remove_outliers_for_variable_by_std(data, col): df = data.copy() df = df[np.abs(df[col]-df[col].mean())<=(3*df[col].std())] return df",No,5,14.0 "#loop for chi square values def calculate_chi_square_values(df, alpha=.05): chi2_dict = {} numerical_col, cat_col = get_numerical_and_categorical_col(df) for i in cat_col: for j in cat_col: if i!=j and (j+' '+i) not in chi2_dict.keys(): chi2, p, dof, ex = chi2_contingency(pd.crosstab(df[i], df[j])) chi2_dict[i+' '+j] = 'Independent? 
'+ str(p>alpha) return chi2_dict",No,5,47.0 "def rmsle(y, y_,convertExp=True): if convertExp: y = inv_boxcox(y, fitted_lambda), y_ = inv_boxcox(y_, fitted_lambda) log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))",No,5,84.0 "def plot_prediction(test, test_pred, train, train_pred, convert_to_original_form = False): if convert_to_original_form: test = inv_boxcox(test, fitted_lambda), test_pred = inv_boxcox(test_pred, fitted_lambda) train = inv_boxcox(train, fitted_lambda), train_pred = inv_boxcox(train_pred, fitted_lambda) f, ax = plt.subplots(1,2, figsize=(10, 5)) ax1 = plt.subplot(1,2,1) sns.distplot(test, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}, ax = ax1) sns.distplot(test_pred, hist=True, kde=True, color = 'red', hist_kws={'edgecolor':'red'}, kde_kws={'linewidth': 4}, ax = ax1) ax1.set_title(""Actual vs Predicted (Test)"") ax2 = plt.subplot(1,2,2) sns.distplot(train, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}, ax = ax2) sns.distplot(train_pred, hist=True, kde=True, color = 'red', hist_kws={'edgecolor':'red'}, kde_kws={'linewidth': 4}, ax = ax2) ax2.set_title(""Actual vs Predicted (Train)"")'",No,5,56.0 "df_hour = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") df_test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"")",No,5,45.0 df_hour.head(),No,5,41.0 "df_hour[""dteday""] = df_hour.datetime.apply(lambda x : x.split()[0]) df_hour[""yr""] = df_hour.datetime.apply(lambda x : x.split()[0][:4]) df_hour['yr'] = df_hour.yr.map({'2011': 0, '2012':1}) df_hour[""hr""] = df_hour.datetime.apply(lambda x : x.split()[1].split("":"")[0]) df_hour[""weekday""] = df_hour.dteday.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()]) df_hour.weekday = df_hour.weekday.map({'Saturday':6, 'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5}) df_hour[""mnth""] = df_hour.dteday.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month]) df_hour.mnth = df_hour.mnth.map({'January':0, 'February':1, 'March':2, 'April':3, 'May':4, 'June':5, 'July':6, 'August':7, 'September':8, 'October':9, 'November':10, 'December':11}) df_hour[""weathersit""] = df_hour.weather df_hour['dteday'] = pd.to_datetime(df_hour['dteday']) del df_hour['weather'] del df_hour['datetime']'",Yes,4,8.0 "#performing same on test #performing same on test df_test[""dteday""] = df_test.datetime.apply(lambda x : x.split()[0]) df_test[""yr""] = df_test.datetime.apply(lambda x : x.split()[0][:4]) df_test['yr'] = df_test.yr.map({'2011': 0, '2012':1}) df_test[""hr""] = df_test.datetime.apply(lambda x : x.split()[1].split("":"")[0]) df_test[""weekday""] = df_test.dteday.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()]) df_test.weekday = df_test.weekday.map({'Saturday':6, 'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5}) df_test[""mnth""] = df_test.dteday.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month]) df_test.mnth = df_test.mnth.map({'January':0, 'February':1, 'March':2, 'April':3, 'May':4, 'June':5, 'July':6, 'August':7, 'September':8, 'October':9, 'November':10, 'December':11}) df_test[""weathersit""] = df_test.weather df_test['dteday'] = 
pd.to_datetime(df_test['dteday']) del df_test['weather']'",Yes,5,8.0 "df_hour['cnt'] = df_hour['count'] del df_hour['count'] df_hour.shape",Yes,4,61.0 df_hour.info(),No,5,40.0 df_hour.columns,No,5,71.0 "#Creating a copy of df to preserve original df df = df_hour.copy()",No,5,12.0 "#Analysing which of these are categorical variables for col in df.columns: print('Count of unique values for ', col, ': ', len(df[col].unique()))",No,5,54.0 "'''We know maximum categories for any categorical col is 24 (for month) Hence we can use this to def function to convert variable to categorical type''' df = change_to_categorical(df, max_cat=24)",No,5,16.0 "dataTypeDf = (df.dtypes.astype(str).value_counts()).reset_index().rename(columns={""index"":""variableType"",0:""count""}) fig,ax = plt.subplots() fig.set_size_inches(12,5) get_count_plot('variableType',dataTypeDf, ax, 'count', value_counts='count', print_percent=True)'",No,3,33.0 df[get_numerical_and_categorical_col(df)[0]].describe(),No,5,40.0 df[get_numerical_and_categorical_col(df)[1]].describe(include='all'),No,5,40.0 "get_plot_for_numerical(df,3)",No,3,33.0 "plt.figure(figsize=(7,5)) visualize_corr_matrix(df)",No,3,80.0 "'''Let's start creating a list which contains all the variables to be deleted. We can delete them once we're done with our exploratory analysis''' cols_to_remove = ['registered','casual','windspeed'] #besides atemp should be deleted immediately for obvious reasons! del df['atemp'] del df_test['atemp']",No,5,10.0 "get_plot_for_numerical(df, 3, plot_type='box')",No,5,33.0 "get_target_dist_with_categorical(df,n_cols=3)",No,5,53.0 "get_target_dist_with_categorical(df, n_cols=2, plot_type='violin')",No,5,33.0 "#Let's perform categorical test chi2 to decide which categorical columns to delete chi2_dict = calculate_chi_square_values(df) chi2_dict",No,5,47.0 "sns.pointplot(x='hr',y='cnt',data=df, hue='season', markers = 'x')",No,5,33.0 "sns.pointplot(x='hr',y='cnt',data=df, hue='weekday', markers = 'x')",No,5,33.0 "#to visualize similar plot for type of user, we would need to use melt #what melt would do, take each hour and generate rows for value variables. 
Next we'll use this to find mean for each hour and for each type of users hr_users_type = pd.melt(df[[""hr"",""casual"",""registered""]], id_vars=['hr'], value_vars=['casual', 'registered']).sort_values(by='hr') hr_users_type.head()'",Yes,2,8.0 "hr_users_type_mean = pd.DataFrame(hr_users_type.groupby([""hr"",""variable""],sort=True)[""value""].mean()).reset_index() hr_users_type_mean.head()",Yes,4,12.0 "sns.pointplot(x=hr_users_type_mean[""hr""], y=hr_users_type_mean[""value""],hue=hr_users_type_mean[""variable""],hue_order=[""casual"",""registered""], data=hr_users_type_mean, join=True)",No,5,33.0 "from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # drop target columns df_original = df.copy() drop_cols=['cnt', 'dteday','registered','casual'] X = df.drop(drop_cols, axis = 1) # X = independent columns (potential predictors) y = df['cnt'] # y = target column (what we want to predict) # instantiate RandomForestClassifier rf_model = RandomForestRegressor() rf_model.fit(X,y) feat_importances = pd.Series(rf_model.feature_importances_, index=X.columns) # determine 20 most important features df_imp_feat = feat_importances.nlargest(20) df_imp_feat.plot(kind='bar') plt.show() print(df_imp_feat) print('Comparing with our columns') print(cols_to_remove)",Yes,3,7.0 "df_cleaned = df.copy() df_cleaned.drop(cols_to_remove, axis=1, inplace=True) df_test.drop(cols_to_remove, axis=1, inplace=True, errors='ignore') df_cleaned.head()",Yes,5,10.0 df_cleaned.describe(),No,5,40.0 "df_cleaned.drop(['cnt','cnt_log','box_cox_reverse'], axis=1, inplace=True) df_cleaned.rename(columns={'cnt_box_cox':'count_transformed'}, inplace=True)",Yes,4,10.0 sc = StandardScaler(),No,2,18.0 "target = 'count_transformed' X = df_cleaned.drop(target, axis=1) y = df_cleaned[target] seed=23 X_train, X_test, y_train, y_test = split(X, y, test_size=.3, random_state=seed)",Yes,5,13.0 "X_train = sc.fit_transform(X_train) X_test = sc.fit_transform(X_test)",No,4,18.0 "from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.metrics import mean_squared_error as mse from sklearn import metrics # Initialize logistic regression model lr = LinearRegression() # Train the model lr.fit(X_train,y = y_train) # Make predictions y_pred = lr.predict(X_test) y_pred_train = lr.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_train, True))",Yes,3,7.0 "coeff_df = pd.DataFrame(lr.coef_, X.columns, columns=['Coefficient']) coeff_df",No,4,79.0 "plot_prediction(y_test, y_pred, y_train, y_pred_train, True)",No,4,56.0 "ridge = Ridge() ridge_param = {'max_iter':[3000], 'alpha':[.1,.03,.3,1,3,10, 30, 100,300]} rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) grid_ridge = GridSearchCV(ridge, ridge_param, scoring = rmsle_scorer, cv = 10) grid_ridge.fit(X_train, y_train) y_pred_ridge = grid_ridge.predict(X_test) y_pred_ridge_train = grid_ridge.predict(X_train) print('Grid Ridge Best Params: ', grid_ridge.best_params_) print('RMSLE for test: ',rmsle(y_test, y_pred_ridge, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_ridge_train, True))",Yes,3,6.0 "fig,ax= plt.subplots() fig.set_size_inches(12,5) df = pd.DataFrame(grid_ridge.cv_results_) df[""alpha""] = df[""params""].apply(lambda x:x[""alpha""]) df[""rmsle""] = df[""mean_test_score""].apply(lambda x:-x) sns.pointplot(data=df,x=""alpha"",y=""rmsle"",ax=ax)",No,5,81.0 "plot_prediction(y_test, y_pred_ridge, y_train, 
y_pred_ridge_train, True)",No,4,56.0 "lasso = Lasso() alpha = 1/np.array([.1,.03,.3,1,3,10, 30, 100,300,1000]) lasso_param = {'max_iter':[3000], 'alpha':alpha} rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) random_lasso = RandomizedSearchCV(lasso, lasso_param, scoring = rmsle_scorer, cv = 10) random_lasso.fit(X_train, y_train) y_pred_lasso = random_lasso.predict(X_test) y_pred_lasso_train = random_lasso.predict(X_train) print('Random Lasso Best Params: ', random_lasso.best_params_) print('RMSLE for test: ',rmsle(y_test, y_pred_lasso, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_lasso_train, True))",No,3,6.0 "fig,ax= plt.subplots() fig.set_size_inches(12,5) df = pd.DataFrame(random_lasso.cv_results_) df[""alpha""] = df[""params""].apply(lambda x:x[""alpha""]) df[""rmsle""] = df[""mean_test_score""].apply(lambda x:-x) sns.pointplot(data=df,x=""alpha"",y=""rmsle"",ax=ax)",No,5,84.0 "plot_prediction(y_test, y_pred_lasso, y_train, y_pred_lasso_train, True)",No,4,56.0 "from sklearn.tree import DecisionTreeRegressor as dt dt_m = dt(random_state=0) dt_m.fit(X_train,y_train) y_pred_dt=dt_m.predict(X_test) y_pred_dt_train=dt_m.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_dt, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_dt_train, True))",Yes,4,7.0 "plot_prediction(y_test, y_pred_dt, y_train, y_pred_dt_train, True)",No,4,56.0 "from sklearn.ensemble import RandomForestRegressor as rfr rf = rfr(n_estimators=100) rf.fit(X_train, y_train) y_pred_rf = rf.predict(X_test) y_pred_rf_train = rf.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_rf, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_rf_train, True))",Yes,3,7.0 "plot_prediction(y_test, y_pred_rf, y_train, y_pred_rf_train, True)",No,4,56.0 "from sklearn.ensemble import GradientBoostingRegressor gbm = GradientBoostingRegressor(n_estimators=3000,alpha=.03) gbm.fit(X_train,y_train) y_pred_gbm = gbm.predict(X_test) y_pred_gbm_train = gbm.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_gbm, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_gbm_train, True))",Yes,3,7.0 "plot_prediction(y_test, y_pred_gbm, y_train, y_pred_gbm_train, True)",No,4,56.0 "df_test = df_test.sort_values(by='datetime') datetime_series = df_test.datetime df_test_for_model = df_test.copy() df_test_for_model.drop(['datetime'], inplace=True, axis=1) X_test_ndarry = df_test_for_model.to_numpy() final_X_test = sc.fit_transform(X_test_ndarry) final_y_pred = inv_boxcox(gbm.predict(final_X_test), fitted_lambda)",Yes,3,21.0 "final_y_pred.shape, datetime_series.shape",No,5,58.0 "submission = pd.DataFrame({'datetime':datetime_series, 'count':np.round(final_y_pred)})",No,5,55.0 "submission_rf = pd.DataFrame({'datetime':datetime_series, 'count':final_y_pred_rf})",No,5,12.0 "sns.distplot(final_y_pred, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4})",No,5,33.0 "#Just to avoid any missing values submission[submission['count'].isna()]",No,4,39.0 "#let's see values around it to fill these #intuition knn submission.iloc[721],submission.iloc[720],submission.iloc[719]",No,4,41.0 "#By this and also by our initial analysis, afternoon is not a preferable time to ride bike submission.fillna(0, inplace=True)",No,5,17.0 "submission.iloc[725],submission.iloc[726],submission.iloc[727]",No,4,41.0 "submission.to_csv('bike_predictions_rounded.csv', index=False) submission.to_csv('bike_predictions_random_forest.csv', index=False)",No,5,25.0 "import pandas as pd import 
calendar import numpy as np import seaborn as sns import matplotlib.pyplot as plt from scipy.special import boxcox, inv_boxcox from datetime import datetime from numpy import arange from pandas import read_csv from sklearn.linear_model import Ridge,Lasso from sklearn.model_selection import RepeatedKFold from sklearn.model_selection import GridSearchCV from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor train_df=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') train_df.head(6)",Yes,4,45.0 "#Fig Will show count distribution sns.distplot(train_df['count']) plt.show() ",No,5,33.0 "#Fig Will show count distribution post log transformation train_df['count']=train_df['count'].apply(lambda x:np.log(x)) sns.distplot(train_df['count']) plt.show() print (train_df['count']) train_df.shape",Yes,4,33.0 "#Visualize Count wrt categorical variables cat_names=['season', 'holiday', 'workingday', 'weather'] i=0 for name in cat_names: i=i+1 plt.subplot(2,2,i) sns.countplot(name,data=train_df) plt.show()",No,5,33.0 "#Visualize data wrt continous variables. cont_names=['temp','atemp','humidity','windspeed'] i=0 for name in cont_names: i=i+1 plt.subplot(2,2,i) sns.boxplot(name,data=train_df) plt.show() #Windspeed seems to be skewed",No,5,33.0 "#Splitting out Datetime attribute in dataframe and dropping unwanted variables as per before analysis new_df=train_df.copy(deep=True) new_df['day']=new_df['datetime'].apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d %H:%M:%S"").weekday()]) new_df['datetime'] = pd.to_datetime(new_df['datetime'], format='%Y-%m-%d %H:%M:%S') new_df['month']=new_df['datetime'].apply(lambda x:x.month) new_df['hour']=new_df['datetime'].apply(lambda x:x.hour) new_df['year']=new_df['datetime'].apply(lambda x:x.year) final_df=new_df.copy(deep=True) final_df=new_df.drop(['datetime','temp','casual','registered'], axis=1) final_df.head()'",Yes,3,16.0 "#adding dummy varibles to categorical variables dropping the souce columns weather_df=pd.get_dummies(final_df['weather'],prefix='w',drop_first=True) yr_df=pd.get_dummies(final_df['year'],prefix='y',drop_first=True) month_df=pd.get_dummies(final_df['month'],prefix='m',drop_first=True) hour_df=pd.get_dummies(final_df['hour'],prefix='h',drop_first=True) season_df=pd.get_dummies(final_df['season'],prefix='s',drop_first=True) day_df=pd.get_dummies(final_df['day'],prefix='d',drop_first=True) final_df=final_df.drop(['weather','year','month','hour','season','day'], axis=1) final_df=final_df.join(weather_df) final_df=final_df.join(yr_df) final_df=final_df.join(month_df) final_df=final_df.join(hour_df) final_df=final_df.join(season_df) final_df=final_df.join(day_df)",Yes,4,20.0 "print(final_df.columns.to_series().groupby(final_df.dtypes).groups) final_df.head(5)",Yes,4,41.0 "#Initializing training set X=final_df.iloc[:,final_df.columns!='count'].values Y=final_df.iloc[:,5].values",No,5,21.0 "#Ridge Regression Implementation 10 Folds # define model model = Ridge() # define model evaluation method cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define grid grid = dict() grid['alpha'] = arange(0, 1, 0.01) # define search search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1) # perform the search results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,5,6.0 "#Lasso Regression Implementation 10 Folds # define model model = Lasso() # define model 
evaluation method cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define grid grid = dict() grid['alpha'] = arange(0, 1, 0.01) # define search search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1) # perform the search results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,5,6.0 "#Decision TreeImplementation 10 Folds cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) dtm = DecisionTreeRegressor(random_state=42) param_grid = {""criterion"": [""mse"", ""mae""], } search = GridSearchCV(dtm,param_grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)'",Yes,4,6.0 "#Decision Tree with Pruning with 10 Folds cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) dtm = DecisionTreeRegressor(random_state=42) param_grid = {""criterion"": [""mse"", ""mae""], ""max_depth"": [2, 6, 8], } search = GridSearchCV(dtm,param_grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)'",Yes,5,6.0 "grid = dict() grid['n_estimators'] = [1000] grid['max_depth'] = [125,150,175] cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) model = RandomForestRegressor() search = GridSearchCV(model,param_grid=grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,4,6.0 "def grid_search(): from sklearn.ensemble import GradientBoostingRegressor print ('lets go') model = GradientBoostingRegressor() # define the grid of values to search grid = dict() grid['n_estimators'] = [4000] grid['learning_rate'] = [ 0.001, 0.01, 0.1] grid['max_depth'] = [4] cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define the grid search procedure grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error') grid_search=grid_search.fit(X,Y) best_accuracy=grid_search.best_score_ best_parameters=grid_search.best_params_ print (best_accuracy) print (best_parameters) grid_search()",Yes,5,6.0 "from sklearn.ensemble import GradientBoostingRegressor rgr=GradientBoostingRegressor(learning_rate=0.1,n_estimators=4000, max_depth=4) rgr.fit(X,Y)",Yes,5,7.0 "test_df=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') test_df['day']=test_df['datetime'].apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d %H:%M:%S"").weekday()]) test_df['datetime']=pd.to_datetime(test_df['datetime'], format='%Y-%m-%d %H:%M:%S') test_df['month']=test_df['datetime'].apply(lambda x:x.month) test_df['hour']=test_df['datetime'].apply(lambda x:x.hour) test_df['year']=test_df['datetime'].apply(lambda x:x.year) test_df=test_df.drop(['datetime','temp'], axis=1) #adding dummy varibles to categorical variables weather_df=pd.get_dummies(test_df['weather'],prefix='w',drop_first=True) yr_df=pd.get_dummies(test_df['year'],prefix='y',drop_first=True) month_df=pd.get_dummies(test_df['month'],prefix='m',drop_first=True) hour_df=pd.get_dummies(test_df['hour'],prefix='h',drop_first=True) season_df=pd.get_dummies(test_df['season'],prefix='s',drop_first=True) day_df=pd.get_dummies(test_df['day'],prefix='d',drop_first=True) 
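# Note: building dummies separately for train and test can leave the two frames with
# different columns if some category (e.g. a rare weather level) never appears in the
# test window. A minimal defensive sketch - assuming final_df is the train-side frame
# built earlier, and that this would run only after the joins below:
#   train_cols = [c for c in final_df.columns if c != 'count']
#   test_df = test_df.reindex(columns=train_cols, fill_value=0)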
test_df=test_df.drop(['weather','year','month','hour','season','day'], axis=1) test_df=test_df.join(weather_df) test_df=test_df.join(yr_df) test_df=test_df.join(month_df) test_df=test_df.join(hour_df) test_df=test_df.join(season_df) test_df=test_df.join(day_df)'",Yes,3,16.0 "temp=pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv') X_test=test_df.iloc[:,:].values y_output=rgr.predict(X_test) y_output op=pd.DataFrame({'count':np.exp(y_output)}) op['datetime']=temp['datetime'] op.to_csv('finalSubmission.csv', index=False)",Yes,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load # import numpy as np # linear algebra # import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "import pandas as pd import numpy as np import seaborn as sns from scipy import stats from datetime import datetime import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "train = pd.read_csv('../input/bike-sharing-demand/train.csv') test = pd.read_csv('../input/bike-sharing-demand/test.csv')",No,5,45.0 "tt = train.append(test) tt = tt.reset_index().drop('index', axis=1) tt.head()",Yes,3,11.0 sns.distplot(train['count']),No,5,33.0 "# Time processing # Add two columns, date and hour respectively temp = pd.DatetimeIndex(train['datetime']) train['year'] = temp.year train['date'] = temp.date train['hour'] = temp.hour # Categorical variables for the day of the week train['dayofweek'] = pd.DatetimeIndex(train.date).dayofweek",No,5,8.0 "# The impact of each time period of the day on count sns.boxplot(train['hour'], train['count'])",No,5,75.0 "# The influence of the total days of the week on count sns.boxplot(train['dayofweek'], train['count'])",No,5,75.0 "# Changes in count for each day of the week sns.pointplot(x='hour', y='count', hue='dayofweek', data=train)",No,5,75.0 "# The impact of different months on count train['month'] = pd.to_datetime(train['datetime']).dt.month sns.boxplot(train['month'], train['count'])",Yes,5,81.0 "# The impact of holidays on count sns.pointplot(x='hour', y='count',hue='workingday', data=train)",No,5,75.0 "# The impact of weather on count sns.pointplot(x='hour', y='count', hue='weather', data=train)",No,5,75.0 "# The influence of season on count sns.pointplot(x='hour', y='count', hue='season', data=train)",No,5,75.0 "# Pearson coefficient cor=train[['temp', 'atemp', 'casual', 'registered', 'humidity','windspeed', 'count']].corr() sns.heatmap(cor, square=True, annot=True)",No,5,80.0 "temp = pd.DatetimeIndex(tt['datetime']) tt['year'] = temp.year tt['hour'] = temp.hour tt = tt[['hour', 'year', 'workingday', 'holiday', 'season', 'weather', 'atemp', 'count']] # One-hot coding for discrete variables, such as color red, yellow, and blue coding as [[1,0,0], [0,1,0], [0,0,1]] tt = pd.get_dummies(tt, 
columns=['hour'], prefix=['hour'], drop_first=True) tt = pd.get_dummies(tt, columns=['year'], prefix=['year'], drop_first=True) tt = pd.get_dummies(tt, columns=['season'], prefix=['season'], drop_first=True) tt = pd.get_dummies(tt, columns=['weather'], prefix=['weather'], drop_first=True) tt.head()",Yes,3,20.0 "# Extract the training set and test set from the processed data set, [0:10886] and [10886:] new_train = tt.iloc[:10886, :] # Pair count+1, then take the logarithm y = np.log1p(new_train['count']) new_test = tt.iloc[10886:, :].drop('count',axis=1) new_train.drop('count', axis=1, inplace=True) x = new_train x.head()",Yes,5,10.0 "from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error from sklearn.model_selection import cross_val_score x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3)",Yes,5,13.0 "from sklearn.linear_model import LinearRegression lmodel = LinearRegression() lmodel.fit(x, y) cross_val_score(lmodel, x, y, cv=5).mean()",Yes,4,7.0 "lmodel.fit(x_train, y_train) pre = lmodel.predict(x_test) mean_squared_error(y_test, pre)",Yes,3,7.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV rfr = RandomForestRegressor(random_state=50, max_features='sqrt', oob_score=True)",Yes,5,4.0 "# Parameter tuning - This step requires a lot of calculation para = {'n_estimators': np.arange(200, 241, 1)} rf = GridSearchCV(estimator=rfr, param_grid=para, cv=5) rf.fit(x, y)",Yes,5,6.0 rf.best_params_,No,5,2.0 "rfr = RandomForestRegressor(n_estimators=227, random_state=50, max_features='sqrt',oob_score=True) cross_val_score(rfr, x, y, cv=5).mean()",No,4,28.0 "rfr.fit(x_train, y_train) pre = rfr.predict(x_test) mean_squared_error(y_test, pre)",Yes,3,7.0 "rfr.fit(x,y)",No,5,7.0 "co = rfr.predict(new_test) m = [] # Decrease the result by one and round up for i in (np.exp(co) - 1): n = round(i) m.append(n) predict = pd.DataFrame({'datetime': test['datetime'], 'count': m}) predict.to_csv('rfr.csv', index=False) ",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas.plotting import register_matplotlib_converters register_matplotlib_converters()",Yes,5,23.0 "# Example # Converts to log1p(count) # Print original count back using expm1 print('Test log and exp') test_count = 100 print('original value', test_count) x = np.log1p(test_count) # log (x+1) print('log1p', x) print('expm1', np.expm1(x)) # exp(x) - 1",No,3,8.0 "df = pd.read_csv('../input/bike-sharing-demand/train.csv',parse_dates=['datetime'],index_col=0) df_test = pd.read_csv('../input/bike-sharing-demand/test.csv',parse_dates=['datetime'],index_col=0) ",No,5,45.0 "# We need to convert datetime to numeric for training. # Let's extract key features into separate numeric columns def add_features(df): df['year'] = df.index.year df['month'] = df.index.month df['day'] = df.index.day df['dayofweek'] = df.index.dayofweek df['hour'] = df.index.hour",No,5,8.0 "# Need to predict the missing data plt.title('Rental Count - Gaps') df['2011-01':'2011-02']['count'].plot() plt.show()",No,5,81.0 "# Rentals change hourly! 
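# (Date-string selections such as df['2011-01-01'] or df['2011-01'] below rely on the
#  DatetimeIndex set when the CSV was read: pandas' partial-string indexing picks every
#  row whose timestamp falls inside that day or month, roughly equivalent to
#  df[(df.index >= '2011-01-01') & (df.index < '2011-01-02')] for a single day.)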
plt.plot(df['2011-01-01']['count']) plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('Hourly Rentals for Jan 01, 2011') plt.show()",No,5,81.0 "# Seasonal plt.plot(df['2011-01']['count']) plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('Jan 2011 Rentals (1 month)') plt.show()",No,5,75.0 "group_hour = df.groupby(['hour']) average_by_hour = group_hour['count'].mean()",Yes,5,60.0 "plt.plot(average_by_hour.index,average_by_hour) plt.xlabel('Hour') plt.ylabel('Rental Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count')",No,5,75.0 "# Year to year trend plt.plot(df['2011']['count'],label='2011') plt.plot(df['2012']['count'],label='2012') plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('2011 and 2012 Rentals (Year to Year)') plt.legend() plt.show()",No,5,75.0 "plt.plot(df['2011']['count'].map(np.log1p),label='2011') plt.plot(df['2012']['count'].map(np.log1p),label='2012') plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Log(Rental Count)') plt.title('2011 and 2012 Rentals (Year to Year)') plt.legend() plt.show()",No,5,75.0 "plt.boxplot([df['count']], labels=['count']) plt.title('Box Plot - Count') plt.ylabel('Target') plt.grid(True)",No,3,33.0 "# Let's see how the data distribution changes with log1p # Evenly distributed plt.boxplot([df['count'].map(np.log1p)], labels=['log1p(count)']) plt.title('Box Plot - log1p(Count)') plt.ylabel('Target') plt.grid(True)",No,3,33.0 "df[""count""] = df[""count""].map(np.log1p)",No,5,8.0 "group_year_month = df.groupby(['year','month'])",No,5,60.0 average_year_month = group_year_month['count'].mean(),No,2,8.0 average_year_month,No,5,41.0 "for year in average_year_month.index.levels[0]: plt.plot(average_year_month[year].index,average_year_month[year],label=year) plt.legend() plt.xlabel('Month') plt.ylabel('Count') plt.grid(True) plt.title('Average Monthly Rental Count for 2011, 2012') plt.show()",No,4,75.0 "group_year_hour = df.groupby(['year','hour']) average_year_hour = group_year_hour['count'].mean() for year in average_year_hour.index.levels[0]: #print (year) #print(average_year_month[year]) plt.plot(average_year_hour[year].index,average_year_hour[year],label=year) plt.legend() plt.xlabel('Hour') plt.ylabel('Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count - 2011, 2012')",Yes,4,75.0 "group_workingday_hour = df.groupby(['workingday','hour']) average_workingday_hour = group_workingday_hour['count'].mean()",Yes,5,60.0 "for workingday in average_workingday_hour.index.levels[0]: #print (year) #print(average_year_month[year]) plt.plot(average_workingday_hour[workingday].index,average_workingday_hour[workingday], label=workingday) plt.legend() plt.xlabel('Hour') plt.ylabel('Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count by Working Day') plt.show()",No,4,75.0 "# Let's look at correlation beween features and target df.corr()['count']",No,5,40.0 "# Any relation between temperature and rental count? plt.scatter(x=df.temp,y=df[""count""]) plt.grid(True) plt.xlabel('Temperature') plt.ylabel('Count') plt.title('Temperature vs Count') plt.show()'",No,5,33.0 "# Any relation between humidity and rental count? 
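# (Reminder: 'count' was replaced by log1p(count) a few cells above, so this scatter,
#  the temperature scatter and the correlation table are all against the log-scale
#  target rather than raw rental counts.)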
plt.scatter(x=df.humidity,y=df[""count""],label='Humidity') plt.grid(True) plt.xlabel('Humidity') plt.ylabel('Count') plt.title('Humidity vs Count') plt.show()'",No,5,33.0 "# Save all data df.to_csv('bike_all.csv',index=True,index_label='datetime',columns=columns)",No,5,25.0 "# Training = 70% of the data # Validation = 30% of the data # Randomize the datset np.random.seed(5) l = list(df.index) np.random.shuffle(l) df = df.loc[l]",No,5,15.0 "rows = df.shape[0] train = int(.7 * rows) test = rows-train",No,4,13.0 "rows, train, test",No,5,41.0 columns,No,5,71.0 "# Write Training Set df.iloc[:train].to_csv('bike_train.csv' ,index=False,header=False ,columns=columns)",No,5,25.0 "# Write Validation Set df.iloc[train:].to_csv('bike_validation.csv' ,index=False,header=False ,columns=columns)",No,5,25.0 "# Test Data has only input features df_test.to_csv('bike_test.csv',index=True,index_label='datetime')",No,5,25.0 "print(','.join(columns))",No,3,71.0 "# Write Column List with open('bike_train_column_list.txt','w') as f: f.write(','.join(columns))",No,5,84.0 "# Install xgboost in notebook instance. #### Command to install xgboost !pip install xgboost==0.90",No,5,87.0 "import sys import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.metrics import mean_squared_error, mean_absolute_error # XGBoost import xgboost as xgb",No,5,22.0 "column_list_file = 'bike_train_column_list.txt' train_file = 'bike_train.csv' validation_file = 'bike_validation.csv' test_file = 'bike_test.csv'",No,5,77.0 "columns = '' with open(column_list_file,'r') as f: columns = f.read().split(',')",No,5,88.0 "# Specify the column names as the file does not have column header df_train = pd.read_csv(train_file,names=columns) df_validation = pd.read_csv(validation_file,names=columns)",No,5,45.0 "X_train = df_train.iloc[:,1:] # Features: 1st column onwards y_train = df_train.iloc[:,0].ravel() # Target: 0th column X_validation = df_validation.iloc[:,1:] y_validation = df_validation.iloc[:,0].ravel()",No,5,21.0 "# XGBoost Training Parameter Reference: # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md #regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150) regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)",No,5,4.0 regressor,No,2,4.0 "regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])",No,5,7.0 eval_result = regressor.evals_result(),No,5,28.0 training_rounds = range(len(eval_result['validation_0']['rmse'])),No,3,77.0 "plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error') plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error') plt.grid(True) plt.xlabel('Iteration') plt.ylabel('RMSE') plt.title('Training Vs Validation Error') plt.legend() plt.show()",No,4,35.0 "xgb.plot_importance(regressor) plt.show()",No,5,79.0 "# Verify Quality using Validation dataset # Compare actual vs predicted performance with dataset not seen by the model before df = pd.read_csv(validation_file,names=columns)",No,5,45.0 "X_test = df.iloc[:,1:] print(X_test[:5])",Yes,4,13.0 result = regressor.predict(X_test),No,5,48.0 result[:5],No,4,41.0 df['count_predicted'] = result,No,5,8.0 "# Negative Values are predicted df['count_predicted'].describe()",No,5,40.0 df[df['count_predicted'] < 0],No,5,14.0 "df['count_predicted'].hist() plt.title('Predicted Count Histogram') plt.show()",No,5,33.0 "def adjust_count(x): if x < 0: return 0 else: return x",No,3,8.0 df['count_predicted'] = 
df['count_predicted'].map(adjust_count),No,3,8.0 "df['count'] = df['count'].map(np.expm1) df['count_predicted'] = df['count_predicted'].map(np.expm1)",No,5,8.0 "# Actual Vs Predicted plt.plot(df['count'], label='Actual') plt.plot(df['count_predicted'],label='Predicted') plt.xlabel('Sample') plt.ylabel('Count') plt.xlim([100,150]) plt.title('Validation Dataset - Predicted Vs. Actual') plt.legend() plt.show()",No,4,56.0 "# Over prediction and Under Prediction needs to be balanced # Training Data Residuals residuals = (df['count'] - df['count_predicted']) plt.hist(residuals) plt.grid(True) plt.xlabel('Actual - Predicted') plt.ylabel('Count') plt.title('Residuals Distribution') plt.axvline(color='r') plt.show()",Yes,5,33.0 "print(""RMSE: {0:0.2f}"".format(mean_squared_error(df['count'],df['count_predicted'])**.5))'",No,5,49.0 "# RMSlE - Root Mean Squared Log Error # RMSLE Metric is used by Kaggle for this competition # RMSE Cost Function - Magnitude of difference matters # RMSLE cost function - ""Only Percentage difference matters"" # Reference:Katerina Malahova, Khor SoonHin # https://www.slideshare.net/KhorSoonHin/rmsle-cost-function def compute_rmsle(y_true, y_pred): if type(y_true) != np.ndarray: y_true = np.array(y_true) if type(y_pred) != np.ndarray: y_pred = np.array(y_pred) return(np.average((np.log1p(y_pred) - np.log1p(y_true))**2)**.5)",No,5,84.0 "print('RMSLE') print(compute_rmsle(100,50), compute_rmsle(1000,500), compute_rmsle(10000,5000))",No,2,49.0 "print('RMSLE') print(compute_rmsle(100,25), compute_rmsle(1000,250), compute_rmsle(10000,2500))",No,2,49.0 "print('RMSE') print(mean_squared_error([100],[50])**.5, mean_squared_error([1000],[500])**.5, mean_squared_error([10000],[5000])**.5)",No,2,49.0 "print('RMSE') print(mean_squared_error([100],[25])**.5, mean_squared_error([1000],[250])**.5, mean_squared_error([10000],[2500])**.5)",No,2,49.0 "print(""RMSLE: {0}"".format(compute_rmsle(df['count'],df['count_predicted'])))'",No,5,49.0 "# Prepare Data for Submission to Kaggle df_test = pd.read_csv(test_file,parse_dates=['datetime'])",No,5,45.0 "X_test = df_test.iloc[:,1:] # Exclude datetime for prediction",No,4,13.0 np.expm1(result),No,4,8.0 "# Convert result to actual count df_test[""count""] = np.expm1(result)",No,5,8.0 "df_test[df_test[""count""] < 0]",No,5,14.0 "df_test[['datetime','count']].to_csv('predicted_count_log.csv',index=False)",No,5,25.0 "submission.iloc[1258:1269, 1]= submission.iloc[1258:1269, 1]*0.5 submission.iloc[4492:4515, 1]= submission.iloc[4492:4515, 1]*0.5 # submission.iloc[6308:6330, 1]= submission.iloc[6308:6330, 1]*0.5 submission.iloc[3041:3063, 1]= submission.iloc[3041:3063, 1]*0.5 # submission.iloc[6332:6354, 1]= submission.iloc[6332:6354, 1]*0.5 submission.iloc[3065:3087, 1]= submission.iloc[3065:3087, 1]*0.5 # submission.iloc[5992:6015, 1]= submission.iloc[5992:6015, 1]*0.5 submission.iloc[2771:2794, 1]= submission.iloc[2771:2794, 1]*0.5'",No,5,14.0 "submission.drop(""holiday"",1,inplace=True) submission.to_csv(""allrf2.csv"", index=False)",No,5,25.0 "import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",No,5,23.0 "train_df = pd.read_csv('../input/bike-sharing-demand/train.csv') test_df = pd.read_csv('../input/bike-sharing-demand/test.csv')",No,5,45.0 "print(train_df.shape) print(test_df.shape)",No,5,58.0 "train_df['datetime'] = pd.to_datetime(train_df['datetime']) test_df['datetime'] = pd.to_datetime(test_df['datetime'])",No,5,16.0 "train_df['year'] = train_df['datetime'].apply(lambda x: x.year) train_df['month'] = 
train_df['datetime'].apply(lambda x: x.month) train_df['day'] = train_df['datetime'].apply(lambda x: x.day) train_df['hour'] = train_df['datetime'].apply(lambda x: x.hour) test_df['year'] = test_df['datetime'].apply(lambda x: x.year) test_df['month'] = test_df['datetime'].apply(lambda x: x.month) test_df['day'] = test_df['datetime'].apply(lambda x: x.day) test_df['hour'] = test_df['datetime'].apply(lambda x: x.hour)",No,5,8.0 "train_df = train_df.drop(['datetime', 'casual', 'registered'], axis=1) test_df = test_df.drop(['datetime'], axis=1)",No,5,10.0 "def rmsle(y, pred): log_y = np.log1p(y) log_pred = np.log1p(pred) squared_error = (log_y - log_pred)**2 rmsle = np.sqrt(np.mean(squared_error)) return rmsle",No,5,84.0 sns.distplot(train_df['count']),No,5,33.0 sns.distplot(np.log1p(train_df['count'])),No,5,33.0 train_df['count'] = np.log1p(train_df['count']),No,5,8.0 "from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.linear_model import LinearRegression, Ridge, Lasso X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['count'], axis=1), train_df['count'], test_size=0.3)",Yes,4,13.0 "lr_reg = LinearRegression() lr_reg.fit(X_train, y_train) pred = lr_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,4,28.0 "coef = pd.Series(lr_reg.coef_, index=X_train.columns) coef_sort = coef.sort_values(ascending=False) sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "train_df = pd.get_dummies(train_df, columns=['year', 'month', 'day', 'hour', 'holiday', 'workingday', 'season', 'weather']) test_df = pd.get_dummies(test_df, columns=['year', 'month', 'day', 'hour', 'holiday', 'workingday', 'season', 'weather'])",No,5,20.0 "train_df, test_df = train_df.align(test_df, join='left', axis=1) test_df = test_df.drop(['count'], axis=1)",Yes,4,10.0 "X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['count'], axis=1), train_df['count'], test_size=0.3)",No,5,13.0 "lr_reg = LinearRegression() lr_reg.fit(X_train, y_train) pred = lr_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('LinearRegression RMSLE:', rmsle(y_test_exp, pred_exp))",No,3,28.0 "ridge_reg = Ridge(alpha=10) ridge_reg.fit(X_train, y_train) pred = ridge_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('Ridge RMSLE:', rmsle(y_test_exp, pred_exp))",No,4,4.0 "lasso_reg = Lasso(alpha=0.01) lasso_reg.fit(X_train, y_train) pred = lasso_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('Lasso RMSLE:', rmsle(y_test_exp, pred_exp))",No,3,28.0 "coef = pd.Series(lr_reg.coef_, index=X_train.columns) coef_sort = coef.sort_values(ascending=False)[:25] sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor",No,5,22.0 "rf_reg = RandomForestRegressor(n_estimators=500) rf_reg.fit(X_train, y_train) pred = rf_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('RandomForestRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "gbm_reg = GradientBoostingRegressor(n_estimators=500) gbm_reg.fit(X_train, y_train) pred = gbm_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('GradientBoostingRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "xgb_reg = XGBRegressor(n_estimators=500) xgb_reg.fit(X_train, y_train) 
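# The target was log1p-transformed earlier, so the predictions below are mapped back
# with np.expm1 before scoring. Up to floating-point error, RMSLE on the original
# scale equals plain RMSE on the log1p scale, because log1p(expm1(x)) == x:
#   np.sqrt(np.mean((np.log1p(pred_exp) - np.log1p(y_test_exp)) ** 2))
#   == np.sqrt(np.mean((pred - y_test) ** 2))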
pred = xgb_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('XGBRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "lgbm_reg = LGBMRegressor(n_estimators=500) lgbm_reg.fit(X_train, y_train) pred = lgbm_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('LGBMRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",No,4,7.0 "X_train = train_df.drop(['count'], axis=1) y_train = train_df['count'] X_test = test_df",No,5,21.0 "print(X_train.shape) print(y_train.shape) print(X_test.shape)",No,5,58.0 "lgbm_reg = LGBMRegressor(n_estimators=500) lgbm_reg.fit(X_train, y_train) pred = lgbm_reg.predict(X_test) pred_exp = np.expm1(pred)",Yes,4,48.0 "submission = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv') submission",No,5,45.0 "submission.loc[:, 'count'] = pred_exp submission",No,5,55.0 "b""import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport os\n\ndef nl():\n print('\\n')\n\nfor f in os.listdir('../input'):\n print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')""",No,5,88.0 "target = df_train['Demanda_uni_equil'].tolist() def label_plot(title, x, y): plt.title(title) plt.xlabel(x) plt.ylabel(y) plt.hist(target, bins=200, color='blue') label_plot('Distribution of target values', 'Demanda_uni_equil', 'Count') plt.show() print(""Looks like we have some pretty big outliers, let's zoom in and try again"") print('Data with target values under 50: ' + str(round(len(df_train.loc[df_train['Demanda_uni_equil'] <= 50]) / 5000, 2)) + '%') plt.hist(target, bins=50, color='blue', range=(0, 50)) label_plot('Distribution of target values under 50', 'Demanda_uni_equil', 'Count') plt.show() '",Yes,4,45.0 "import numpy as np import pandas as pd import math colum_target=pd.read_csv(""../input/train.csv"",usecols=['Demanda_uni_equil']) m=colum_target['Demanda_uni_equil'].tolist() #print(m) #m=np.mean(np.linalg.logm(colum_target['Demanda_uni_equil'].value+1)) #m=np.exp(math.log(+1).mean()) #mm=colum_target['m'].mean() #print(mm) #result_mean=pd.read_csv(""../input/test.csv"",usecols=['id']) #result_mean['Demanda_uni_equil']=exp(mean) #result_mean.to_csv('result_mean.csv',index=False)'",Yes,4,22.0 "x=np.exp(np.mean(np.log(np.array(m)+1)))-1 print (x) result_logmean=pd.read_csv(""../input/test.csv"",usecols=['id']) result_logmean['Demanda_uni_equil']=x result_logmean.to_csv('result_logmean.csv',index=False)'",Yes,4,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) train_data=pd.read_csv(""../input/train.csv"",usecols=['Producto_ID','Demanda_uni_equil']) '",Yes,5,45.0 "train_data['log_Dem']=np.log(np.array(train_data['Demanda_uni_equil'].tolist())+1) #print(train_data)",No,5,8.0 "test_data['Demanda_uni_equil']=np.exp(log_target)-1 print(test_data) test_data.to_csv('result_groupmean_log.csv',index=False,columns=['id','Demanda_uni_equil'])",Yes,5,25.0 test_data[test_data['Producto_ID']==41]['id'],No,3,41.0 "#mean_data.index #mean_data.ix[41] test_data.shape",No,5,58.0 "import numpy as np import pandas as pd from subprocess import check_output #types={'Semana':np.uint8,'Agencia_ID':np.uint16,'Canal_ID':np.uint8, # 'Ruta_SAK':np.uint16,'Cliente_ID':np.uint32,'Producto_ID':np.uint16, # 'Demanda_uni_equil':np.uint32} types = {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8, 'Ruta_SAK':np.uint16, 'Cliente_ID':np.uint32, 'Producto_ID':np.uint16, 'Demanda_uni_equil':np.uint32} #train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtype=types) train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtype=types,nrows=1000) print(train.dtype) print(train.info(memery_usage=True)) ",No,3,45.0 "import numpy as np import pandas as pd from subprocess import check_output #types={'Semana':np.uint8,'Agencia_ID':np.uint16,'Canal_ID':np.uint8, # 'Ruta_SAK':np.uint16,'Cliente_ID':np.uint32,'Producto_ID':np.uint16, # 'Demanda_uni_equil':np.uint32} types = {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8, 'Ruta_SAK':np.uint16, 'Cliente_ID':np.uint32, 'Producto_ID':np.uint16, 'Demanda_uni_equil':np.uint32} #train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtypes=types) train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtypes=types,nrows=1000) print(train.dtype) print(train.info(memery_usage=True))",Yes,3,45.0 "from subprocess import check_output import pandas as pd print(check_output(['ls','.']).decode('utf8')) submission=pd.read_csv('../input/sample_submission.csv') print(submission.shape) print(submission.columns) print(submission.head(20))",Yes,3,45.0 "import numpy as np import pandas as pd import gc import xgboost as xgb import math from sklearn.cross_validation import train_test_split from ml_metrics import rmsle def evalerror(preds, dtrain): labels = dtrain.get_label() assert len(preds) == len(labels) labels = labels.tolist() preds = preds.tolist() terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1)) ** 2.0 for i,pred in enumerate(labels)] return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5 nrows=10000 train=pd.read_csv('../input/train.csv',nrows=nrows) test=pd.read_csv('../input/test.csv',nrows=nrows) print(train.columns) print(test.columns) print(test.columns.values) ids=test['id'] test=test.drop(['id'],axis=1) y=train['Demanda_uni_equil'] X=train[test.columns.values] #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729) X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1000) params = {} params['objective'] = ""reg:linear"" params['eta'] = 0.05 params['max_depth'] = 5 params['subsample'] = 0.8 params['colsample_bytree'] = 0.6 params['silent'] = True print ('') test_preds = np.zeros(test.shape[0]) xg_train = xgb.DMatrix(X_train, label=y_train) xg_test = xgb.DMatrix(X_test) watchlist = [(xg_train, 'train')] num_rounds = 100 xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 20, verbose_eval = 10) preds = xgclassifier.predict(xg_test, 
ntree_limit=xgclassifier.best_iteration) print ('RMSLE Score:', rmsle(y_test, preds)) fxg_test = xgb.DMatrix(test) fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1) test_preds += fold_preds submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds}) submission.to_csv('submission.csv', index=False)'",Yes,3,45.0 test['count'] = (np.exp(rf.predict(test[feats])) - 1),No,5,8.0 "test[['datetime', 'count']].to_csv('submission.csv', index=False)",No,5,25.0 "__author__ = 'ZFTurbo: https://kaggle.com/zfturbo' import datetime import pandas as pd import numpy as np from sklearn.cross_validation import train_test_split import xgboost as xgb import random import zipfile import time import shutil from sklearn.metrics import log_loss random.seed(2016) def run_xgb(train, test, features, target, random_state=0): eta = 0.3 max_depth = 6 subsample = 1 colsample_bytree = 0.7 start_time = time.time() print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree)) params = { ""objective"": ""multi:softprob"", ""num_class"": 12, ""booster"" : ""gbtree"", ""eval_metric"": ""mlogloss"", ""eta"": eta, ""max_depth"": max_depth, ""subsample"": subsample, ""colsample_bytree"": colsample_bytree, ""silent"": 1, ""seed"": random_state, } num_boost_round = 500 early_stopping_rounds = 50 test_size = 0.3 X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state) print('Length train:', len(X_train.index)) print('Length valid:', len(X_valid.index)) y_train = X_train[target] y_valid = X_valid[target] dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True) print(""Validating..."") check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration) score = log_loss(y_valid.tolist(), check) print(""Predict test set..."") test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration) print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2))) return test_prediction.tolist(), score def create_submission(score, test, prediction): # Make Submission now = datetime.datetime.now() sub_file = 'submission_' + str(score) + '_' + str(now.strftime(""%Y-%m-%d-%H-%M"")) + '.csv' print('Writing submission: ', sub_file) f = open(sub_file, 'w') f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\ ') total = 0 test_val = test['device_id'].values for i in range(len(test_val)): str1 = str(test_val[i]) for j in range(12): str1 += ',' + str(prediction[i][j]) str1 += '\ ' total += 1 f.write(str1) f.close() def map_column(table, f): labels = sorted(table[f].unique()) mappings = dict() for i in range(len(labels)): mappings[labels[i]] = i table = table.replace({f: mappings}) return table def read_train_test(): # Events print('Read events...') events = pd.read_csv(""../input/events.csv"", dtype={'device_id': np.str}) events['counts'] = events.groupby(['device_id'])['event_id'].transform('count') events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first') # Phone brand print('Read brands...') pbd = pd.read_csv(""../input/phone_brand_device_model.csv"", dtype={'device_id': np.str}) pbd.drop_duplicates('device_id', keep='first', 
inplace=True) pbd = map_column(pbd, 'phone_brand') pbd = map_column(pbd, 'device_model') # Train print('Read train...') train = pd.read_csv(""../input/gender_age_train.csv"", dtype={'device_id': np.str}) train = map_column(train, 'group') train = train.drop(['age'], axis=1) train = train.drop(['gender'], axis=1) train = pd.merge(train, pbd, how='left', on='device_id', left_index=True) train = pd.merge(train, events_small, how='left', on='device_id', left_index=True) train.fillna(-1, inplace=True) # Test print('Read test...') test = pd.read_csv(""../input/gender_age_test.csv"", dtype={'device_id': np.str}) test = pd.merge(test, pbd, how='left', on='device_id', left_index=True) test = pd.merge(test, events_small, how='left', on='device_id', left_index=True) test.fillna(-1, inplace=True) # Features features = list(test.columns.values) features.remove('device_id') return train, test, features train, test, features = read_train_test() print('Length of train: ', len(train)) print('Length of test: ', len(test)) print('Features [{}]: {}'.format(len(features), sorted(features))) test_prediction, score = run_xgb(train, test, features, 'group') print(""LS: {}"".format(round(score, 5))) create_submission(score, test, test_prediction) '",No,2,22.0 "import random import datetime import numpy as np import pandas as pd import xgboost as xgb from scipy.sparse import csr_matrix, hstack from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import train_test_split from subprocess import check_output random.seed(2010) print(check_output(['ls', '../input']).decode('utf8'))",No,4,88.0 "def read(file_name): return pd.read_csv('../input/{}.csv'.format(file_name)) app_events = read('app_events') app_labels = read('app_labels') events = read('events') gender_age_test = read('gender_age_test') gender_age_train = read('gender_age_train') label_categories = read('label_categories') phone_brand_device_model = read('phone_brand_device_model') sample_submission = read('sample_submission') app_le = LabelEncoder() app_le.fit(app_events['app_id']) device_model_le = LabelEncoder() device_model_le.fit(phone_brand_device_model['device_model'])",Yes,4,45.0 "import pandas as pd import numpy as np %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,23.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))",No,4,45.0 "gatrain['trainrow'] = np.arange(gatrain.shape[0]) gatest['testrow'] = np.arange(gatest.shape[0])",No,5,8.0 "m = phone.phone_brand.str.cat(phone.device_model) modelencoder = LabelEncoder().fit(m) phone['model'] = modelencoder.transform(m) gatrain['model'] = phone['model'] gatest['model'] = 
phone['model'] Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.model))) Xte_model = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",Yes,4,20.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelencoder = LabelEncoder().fit(applabels.label_id) applabels['label'] = labelencoder.transform(applabels.label_id) nlabels = len(labelencoder.classes_)",No,4,20.0 "devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",No,3,32.0 "d = devicelabels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(gatrain.shape[0],nlabels)) d = devicelabels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(gatest.shape[0],nlabels)) print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))",Yes,4,58.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,12.0 Xtrain,No,5,41.0 "targetencoder = LabelEncoder().fit(gatrain.group) y = targetencoder.transform(gatrain.group) nclasses = len(targetencoder.classes_)",No,5,20.0 "dtrain = xgb.DMatrix(Xtrain, y)",No,2,7.0 "params = { ""eta"": 0.1, ""booster"": ""gblinear"", ""objective"": ""multi:softprob"", ""alpha"": 4, ""lambda"": 0, ""silent"": 1, ""seed"": 1233, ""num_class"": 12, ""eval_metric"": ""mlogloss"" }",No,5,59.0 "xgb.cv(params, dtrain, num_boost_round=50, #early_stopping_rounds = 5, maximize = False)",No,5,28.0 "from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier() #model.fit(Xtrain, y) hmm this is going to take too long.",No,5,4.0 "model = xgb.train(params, dtrain, num_boost_round=25)",No,5,7.0 "model.predict(dtest) pred = pd.DataFrame(model.predict(dtest), index = gatest.index, columns=targetencoder.classes_)",No,5,48.0 pred.head(),No,5,41.0 "pred.to_csv('xgb_subm.csv',index=True) ",No,5,25.0 "brandencoder = LabelEncoder().fit(phone.phone_brand) phone['brand'] = brandencoder.transform(phone['phone_brand']) gatrain['brand'] = phone['brand'] gatest['brand'] = phone['brand'] Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.brand))) Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand))) print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) deviceapps.head()",No,4,12.0 "clf = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs') clf.fit(Xtrain, 
y) pred = pd.DataFrame(clf.predict_proba(Xtest), index = gatest.index, columns=targetencoder.classes_) pred.head()",No,4,49.0 "pred.to_csv('logreg_subm.csv',index=True)",No,5,25.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack import xgboost as xgb from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,22.0 "# function to bin the timestamp in time of day def bintod(x): if x < 3: return 0 elif x < 8: return 1 elif x < 20: return 2 elif x < 23: return 3 else: return 0 # functions to assign region based on latitude and longitude def lngregion(x): if x < 80: return 1 elif x < 90: return 2 elif x < 100: return 3 elif x < 110: return 4 elif x < 120: return 5 elif x < 130: return 6 elif x < 140: return 7 else: return 0 nlng = 8 nlat = 9 def latregion(x): if x < 20: return 1 elif x < 25: return 2 elif x < 30: return 3 elif x < 35: return 4 elif x < 40: return 5 elif x < 45: return 6 elif x < 50: return 7 elif x < 55: return 8 else: return 0 ",No,5,20.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv')) labelcat = pd.read_csv(os.path.join(datadir,'label_categories.csv')) labelcat['category']=labelcat['category'].fillna('label-missing') labelcat.head()",No,4,45.0 "# clean lat, long info, 0s are missing, also set out of china to missing events['longitude'] = events['longitude'].round(0) events['latitude'] = events['latitude'].round(0) #set out of China to missing along with 0s events['longitude'] = events['longitude'].clip_lower(73.0).replace(73.0, np.NaN) events['longitude'] = events['longitude'].clip_upper(135.0).replace(135.0, np.NaN) events['latitude'] = events['latitude'].clip_lower(15.0).replace(15.0, np.NaN) events['latitude'] = events['latitude'].clip_upper(60.0).replace(60.0, np.NaN) # lot of missing values - replace them with mode (most common lat, long) events['latitude2'] =events.groupby(['device_id'])['latitude'].transform(lambda x: x.mode()) events['longitude2'] =events.groupby(['device_id'])['longitude'].transform(lambda x: x.mode())",No,3,17.0 "# lat long location for each device events_latlng = events[['device_id', 'latitude2','longitude2']].drop_duplicates('device_id', keep='first') events_latlng = events_latlng.set_index('device_id') print('Number of devices with some lat long info',len(events_latlng['latitude2'])) print('out of that missing longitude: ', sum(events_latlng['longitude2'].isnull())) print('out of that missing latitude: ', sum(events_latlng['latitude2'].isnull())) events_latlng['lng_region'] = events_latlng['longitude2'].apply(lngregion) events_latlng['lat_region'] = events_latlng['latitude2'].apply(latregion) print (""Frequencies longitude region:"" '\ ', events_latlng['lng_region'].value_counts()) print (""Frequencies latitude region:"" '\ 
', events_latlng['lat_region'].value_counts())'",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index())",No,4,20.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelcat = labelcat.loc[labelcat.label_id.isin(applabels.label_id.unique())] labelencoder = LabelEncoder().fit(labelcat.category) labelcat['label'] = labelencoder.transform(labelcat.category) nlabels = len(labelencoder.classes_) print('number of unique labels:',nlabels) print('recoded label categories', '/n',labelcat.head(n=20)) applabels=applabels.merge(labelcat[['label','label_id']], how='left',left_on='label_id',right_on='label_id') devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",Yes,4,20.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_tod, Xtr_dow, Xtr_lat, Xtr_lng), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_tod, Xte_dow, Xte_lat, Xte_lng), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,11.0 "targetencoder = LabelEncoder().fit(gatrain.group) y = targetencoder.transform(gatrain.group)",No,5,20.0 "########## XGBOOST ########## params = {} params['booster'] = 'gblinear' params['objective'] = ""multi:softprob"" params['eval_metric'] = 'mlogloss' params['eta'] = 0.005 params['num_class'] = 12 params['lambda'] = 3 params['alpha'] = 2'",No,5,59.0 "clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=25) pred = clf.predict(xgb.DMatrix(Xtest)) pred = pd.DataFrame(pred, index = gatest.index, columns=targetencoder.classes_) pred.head() pred.to_csv('sparse_xgb_v11.csv', index=True)",Yes,4,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "b""# device_iddevice\n# ga_trainga_testapp_eventstraintest\nga_train = pd.read_csv('../input/gender_age_train.csv', index_col='device_id')\nga_test = pd.read_csv('../input/gender_age_test.csv', index_col='device_id') \n# event_idapp_eventsappevent_idevent_iddevice_idga_traindevice_idgenderage\nevents = pd.read_csv('../input/events.csv', index_col='event_id', parse_dates=['timestamp']) \n# appis_installed1\napp_events = pd.read_csv('../input/app_events.csv', usecols=['event_id','app_id','is_active'])\n# phone_branddevice_model\n##### \ndevice_brand = pd.read_csv('../input/phone_brand_device_model.csv')\ndevice_brand = device_brand.drop_duplicates('device_id').set_index('device_id')\napp_labels = pd.read_csv('../input/app_labels.csv')""",No,4,45.0 "import numpy as np import pandas as pd import os import gc nrows=100000 train=pd.read_csv('../input/train.csv',nrows=nrows) print(train.shape) print(train.columns) data=train.copy() data['target']=data['Demanda_uni_equil'] data.drop(['Demanda_uni_equil'],axis=1,inplace=True) nCliente_ID = pd.DataFrame(pd.groupby(data,['Cliente_ID','Semana'])['target'].count()) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = nCliente_ID.reset_index() print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID.rename(columns={'target': 'nCliente_ID'}, inplace=True) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = pd.DataFrame(pd.groupby(nCliente_ID,['Cliente_ID'])['nCliente_ID'].mean()) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = nCliente_ID.reset_index() print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) data = pd.merge(data, nCliente_ID, how='left', left_on=['Cliente_ID'], right_on=['Cliente_ID'], left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=False) print(data.columns) print(data.head(50)) del nCliente_ID gc.collect() print('merge completo nCliente_ID') print(data.shape[0])",Yes,2,22.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. 
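# (Summary of the run_solution() helper defined in this cell: it streams train.csv
#  once, accumulating demand sums and counts per (Cliente_ID, Producto_ID), per
#  client and per product, then writes a prediction for each test row using the most
#  specific mean available - client+product, else client, else product, else the
#  overall average. The call at the bottom is commented out in this version.)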
import datetime import time from collections import defaultdict import gc def run_solution(): print('Preparing arrays...') f = open(""../input/train.csv"", ""r"") f.readline() total = 0 client_product_arr = defaultdict(int) client_product_arr_count = defaultdict(int) client_arr = defaultdict(int) client_arr_count = defaultdict(int) product_arr = defaultdict(int) product_arr_count = defaultdict(int) # Calc counts avg_target = 0.0 while 1: line = f.readline().strip() total += 1 if total % 10000000 == 0: print('Read {} lines...'.format(total)) if line == '': break arr = line.split("","") week = int(arr[0]) agency = arr[1] canal_id = arr[2] ruta_sak = arr[3] cliente_id = int(arr[4]) producto_id = int(arr[5]) vuh = arr[6] vh = arr[7] dup = arr[8] dp = arr[9] target = int(arr[10]) avg_target += target client_product_arr[(cliente_id, producto_id)] += target client_product_arr_count[(cliente_id, producto_id)] += 1 client_arr[cliente_id] += target client_arr_count[cliente_id] += 1 product_arr[producto_id] += target product_arr_count[producto_id] += 1 f.close() avg_target /= total print('Average target: ', avg_target) gc.collect() print('Generate submission...') now = datetime.datetime.now() path = 'submission_' + str(now.strftime(""%Y-%m-%d-%H-%M"")) + '.csv' out = open(path, ""w"") f = open(""../input/test.csv"", ""r"") f.readline() total = 0 out.write(""id,Demanda_uni_equil\ "") index_both = 0 index_client = 0 index_product = 0 index_empty = 0 while 1: line = f.readline().strip() total += 1 if total % 10000000 == 0: print('Write {} lines...'.format(total)) if line == '': break arr = line.split("","") id = arr[0] week = int(arr[1]) agency = arr[2] canal_id = arr[3] ruta_sak = arr[4] cliente_id = int(arr[5]) producto_id = int(arr[6]) out.write(str(id) + ',') if (cliente_id, producto_id) in client_product_arr: val = client_product_arr[(cliente_id, producto_id)]/client_product_arr_count[(cliente_id, producto_id)] out.write(str(val)) index_both += 1 elif cliente_id in client_arr: val = client_arr[cliente_id]/client_arr_count[cliente_id] out.write(str(val)) index_client += 1 elif producto_id in product_arr: val = product_arr[producto_id]/product_arr_count[producto_id] out.write(str(val)) index_product += 1 else: out.write(str(avg_target)) index_empty += 1 out.write(""\ "") print('Both: {}'.format(index_both)) print('Client: {}'.format(index_client)) print('Product: {}'.format(index_product)) print('Empty: {}'.format(index_empty)) out.close() f.close() start_time = time.time() #run_solution() print(""Elapsed time overall: %s seconds"" % (time.time() - start_time))'",Yes,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt import matplotlib.cm as cm from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import KFold from sklearn.metrics import log_loss # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "df_gender_age_test = pd.read_csv('../input/gender_age_test.csv', dtype={'device_id': np.str}) df_gender_age_train = pd.read_csv('../input/gender_age_train.csv', dtype={'device_id': np.str}) df_app_events = pd.read_csv('../input/app_events.csv', dtype={'app_id': np.str}) df_events = pd.read_csv('../input/events.csv', dtype={'device_id': np.str}) df_app_labels = pd.read_csv('../input/app_labels.csv', dtype={'app_id': np.str}) df_label_categories = pd.read_csv('../input/label_categories.csv') df_phone_brands = pd.read_csv('../input/phone_brand_device_model.csv', dtype={'device_id': np.str})",No,5,45.0 df_gender_age_test.head(),No,5,41.0 "df_gender_age_test.device_id.nunique(), df_gender_age_test.shape[0]",Yes,5,54.0 df_gender_age_train.head(),No,5,41.0 "df_gender_age_train.device_id.nunique(), df_gender_age_train.shape[0]",Yes,5,54.0 df_gender_age_train.info(),No,5,40.0 df_gender_age_train.describe(include='all').T,No,5,40.0 "df_ga_full = pd.concat([df_gender_age_train, df_gender_age_test], axis=0, sort=False)",No,5,11.0 df_ga_full.device_id.nunique(),No,5,54.0 df_events.head(),No,5,41.0 "df_events.event_id.nunique(), df_events.device_id.nunique(), df_events.shape[0]",Yes,3,58.0 df_app_events.head(),No,5,41.0 "df_app_events.event_id.nunique(), df_app_events.shape[0]",Yes,4,54.0 "# df_gender_age_train.device_id[] in_train_events = df_events[df_events.device_id.isin(set(df_gender_age_train.device_id) & set(df_events.device_id))] in_train_app_events = df_app_events[df_app_events.event_id.isin(in_train_events.event_id)] in_train_app_events.event_id.nunique(), in_train_app_events.event_id.size, len(in_train_events)",Yes,3,14.0 "in_test_events = df_events[df_events.device_id.isin(set(df_gender_age_test.device_id) & set(df_events.device_id))] in_test_app_events = df_app_events[df_app_events.event_id.isin(in_test_events.event_id)] in_train_app_events.event_id.nunique(), in_train_app_events.event_id.size, len(in_train_events)",Yes,3,14.0 "del in_train_events del in_train_app_events del in_test_events del in_test_app_events",No,5,10.0 "import gc gc.collect()",No,5,23.0 df_app_labels.head(),No,5,41.0 "df_app_labels.app_id.nunique(), df_app_labels.label_id.nunique(), df_app_labels.shape[0]",Yes,5,54.0 df_label_categories.head(),No,5,41.0 "df_label_categories.category.nunique(), df_label_categories.shape[0]",Yes,5,54.0 df_phone_brands.head(),No,5,41.0 "df_phone_brands.device_id.nunique(), df_phone_brands.shape[0]",Yes,5,54.0 "df_phone_brands.drop_duplicates(subset='device_id', inplace=True)",No,5,19.0 a.shape[0],No,5,58.0 "df_phone_brands.phone_brand = df_phone_brands.phone_brand.map(str.strip).map(str.lower) df_phone_brands.device_model = df_phone_brands.device_model.map(str.strip).map(str.lower) df_phone_brands.device_model = df_phone_brands.phone_brand.str.cat(df_phone_brands.device_model)",No,5,78.0 df_phone_brands.info(),No,5,40.0 df_phone_brands.describe(),No,5,40.0 "df_ga_full = df_ga_full.merge(df_phone_brands, how='left', on='device_id')",No,5,32.0 "df_train = df_ga_full.loc[df_ga_full.device_id.isin(df_gender_age_train.device_id.tolist())] df_test = df_ga_full.loc[df_ga_full.device_id.isin(df_gender_age_test.device_id.tolist())]",No,5,13.0 "# sns.kdeplot(df_gender_age_train.age) fig = plt.figure(figsize=(9, 6)) sns.distplot(df_gender_age_train.age, ax=fig.gca()) 
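# Note: sns.distplot (histogram with a KDE overlay) is deprecated in seaborn >= 0.11;
# sns.histplot(df_gender_age_train.age, kde=True) is the closest modern equivalent.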
plt.title('Age distribution') sns.despine()",No,5,33.0 
"fig = plt.figure(figsize=(7, 4)) sns.barplot(x = df_gender_age_train.gender.value_counts().index, y=df_gender_age_train.gender.value_counts().values, ax=fig.gca()) sns.despine() plt.title('Gender distribution')",No,5,33.0 
"df_gender_age_train.groupby('group').device_id.size().sort_index(ascending=False).plot.barh(title='Age Gender Group Distribution') sns.despine()",No,5,33.0 
"share_majority = market_share[~(market_share>0.95)].index.tolist() share_others = market_share[market_share>0.95].index.tolist() share_majority2 = market_share2[~(market_share2>0.60)].index.tolist() share_others2 = market_share2[market_share2>0.60].index.tolist()",No,3,13.0 
"# https://seaborn.pydata.org/tutorial/categorical.html # sns.swarmplot(x=""phone_brand"", y=""age"", hue=""gender"", data=df_train); fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""phone_brand"", y=""age"", hue=""gender"", data=df_train[df_train.phone_brand.isin(share_majority)].sort_values('age'), ax=fig.gca()); ax.set_xticklabels(share_majority, rotation=30); str(share_majority)",No,4,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""device_model"", y=""age"", hue=""gender"", data=df_train[df_train.device_model.isin(share_majority2)].sort_values('age'), ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30); str(share_majority2)",No,4,33.0 
"# collapse the label_ids of each app into one space-separated string (one row per app_id) # df_app_labels.groupby('app_id').label_id.groups df_app_labels = df_app_labels.groupby('app_id').label_id.apply(lambda x: ' '.join(str(s) for s in x)) df_app_labels.head()",Yes,4,78.0 
df_app_events = df_app_events.groupby('event_id').app_lab.apply(lambda x: ' '.join(str(s) for s in x)),No,4,78.0 
"del df_label_categories del df_app_labels",No,4,10.0 
df_events['app_lab'] = df_events.event_id.map(df_app_events),No,5,8.0 
df_events['timestamp'] = pd.to_datetime(df_events['timestamp']),No,5,16.0 
df_events['hour'] = df_events['timestamp'].dt.hour,No,5,8.0 
time_large = df_events.groupby('device_id')['hour'].apply(lambda x: max(x)),No,5,60.0 
time_small = df_events.groupby('device_id')['hour'].apply(lambda x: min(x)),No,5,60.0 
"from collections import Counter time_most = df_events.groupby('device_id')['hour'].apply(lambda x: Counter(x).most_common(1)[0][0])",Yes,3,22.0 
del df_app_events,No,4,10.0 
"df_events.app_lab = df_events.app_lab.fillna('Missing') df_events = df_events.groupby('device_id').app_lab.apply(lambda x: ' '.join(str(s) for s in x))",Yes,4,17.0 
"df_ga_full['app_lab']= df_ga_full['device_id'].map(df_events) df_ga_full['time_most']= df_ga_full['device_id'].map(time_most) df_ga_full['time_large']= df_ga_full['device_id'].map(time_large) df_ga_full['time_small']= df_ga_full['device_id'].map(time_small)",No,5,20.0 
df_ga_full.head(),No,5,41.0 
"del df_train del df_test del df_events del df_phone_brands del time_large del time_most del time_small",No,4,10.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_most"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_large"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_small"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer(binary=True) # fill missing app_lab values with 'Missing' before building the binary bag-of-labels matrix df_app_lab_vectorized = vectorizer.fit_transform(df_ga_full['app_lab'].fillna('Missing')) # label category feature names str(vectorizer.get_feature_names())",Yes,5,8.0 
"app_labels = pd.DataFrame(df_app_lab_vectorized.toarray(), columns=vectorizer.get_feature_names(), index=df_ga_full.device_id) app_labels.head(3)",No,4,12.0 
"df_ga_full = df_ga_full.merge(app_labels, how='left', left_on='device_id', right_index=True)",No,5,32.0 
df_ga_full.head(3),No,5,41.0 
"df_ga_full = pd.get_dummies(df_ga_full.drop(columns=['gender', 'age', 'app_lab']), columns=['phone_brand', 'device_model', 'time_most', 'time_large', 'time_small'])",No,5,20.0 
df_ga_full.shape,No,5,58.0 
df_ga_full.info(),No,5,40.0 
df_ga_full.describe(),No,5,40.0 
"train = df_ga_full[df_ga_full.device_id.isin(df_gender_age_train.device_id)] test = df_ga_full[df_ga_full.device_id.isin(df_gender_age_test.device_id)].drop(columns=['group']) X = train.drop(columns=['group']) encoder = LabelEncoder() Y = encoder.fit_transform(train['group'])",Yes,5,21.0 
"X.shape, Y.shape",No,5,58.0 
"from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import cross_val_score # scores = cross_val_score(LogisticRegression(), X, Y, scoring='neg_log_loss',cv=10, verbose=1)",No,5,22.0 
"import xgboost as xgb from sklearn.model_selection import train_test_split X.set_index('device_id', inplace=True) X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size=.80) ################## # XGBoost ################## dtrain = xgb.DMatrix(X_train, y_train) dvalid = xgb.DMatrix(X_val, y_val) params = { ""objective"": ""multi:softprob"", ""num_class"": 12, # the encoded target Y has 12 gender/age classes ""booster"": ""gbtree"", # tree booster; gblinear is the linear alternative ""eval_metric"": ""mlogloss"", ""eta"": 0.3, # GBM learning rate ""silent"": 0, # 0 prints training messages, 1 silences them } watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, 140, evals=watchlist, verbose_eval=True)",Yes,3,7.0 
"test.set_index('device_id', inplace=True) y_pre = gbm.predict(xgb.DMatrix(test), ntree_limit=gbm.best_iteration) # scores = cross_val_score(RandomForestClassifier(n_est",No,5,48.0 
pd.read_csv('../input/sample_submission.csv').head(),No,5,41.0 
"result = pd.DataFrame(y_pre, index=test.index, columns=encoder.classes_) result.head()",Yes,4,12.0 
result.to_csv('./predict_prob.csv'),No,5,25.0 
pd.read_csv('./predict_prob.csv').head(),No,5,45.0 
"import gc import numpy as np import pandas as pd import xgboost as xgb from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import matthews_corrcoef from operator import itemgetter # per raddar, all date features except for stations 24+25 are identical def get_date_features(): directory = '../input/' trainfile = 'train_date.csv' for i, chunk in enumerate(pd.read_csv(directory + trainfile, chunksize=1, low_memory=False)): features = list(chunk.columns) break seen = np.zeros(52) rv = [] for f in features: if f == 'Id': rv.append(f) continue station = int(f.split('_')[1][1:]) if seen[station]: continue seen[station] = 1 rv.append(f) return rv usefuldatefeatures = get_date_features() def get_mindate(): directory = '../input/' trainfile = 'train_date.csv' testfile = 'test_date.csv' features = None subset = None for i, chunk in enumerate(pd.read_csv(directory + trainfile, usecols=usefuldatefeatures, chunksize=50000, low_memory=False)): print(i) if features is None: features = list(chunk.columns) features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = 
chunk[features].min(axis=1).values if subset is None: subset = df_mindate_chunk.copy() else: subset = pd.concat([subset, df_mindate_chunk]) del chunk gc.collect() for i, chunk in enumerate(pd.read_csv(directory + testfile, usecols=usefuldatefeatures, chunksize=50000, low_memory=False)): print(i) df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values subset = pd.concat([subset, df_mindate_chunk]) del chunk gc.collect() return subset df_mindate = get_mindate() df_mindate.sort_values(by=['mindate', 'Id'], inplace=True) df_mindate['mindate_id_diff'] = df_mindate.Id.diff() midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan) midr[0:-1] = -df_mindate.mindate_id_diff.values[1:] df_mindate['mindate_id_diff_reverse'] = midr def mcc(tp, tn, fp, fn): sup = tp * tn - fp * fn inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) if inf == 0: return 0 else: return sup / np.sqrt(inf) def eval_mcc(y_true, y_prob, show=False): idx = np.argsort(y_prob) y_true_sort = y_true[idx] n = y_true.shape[0] nump = 1.0 * np.sum(y_true) # number of positive numn = n - nump # number of negative tp = nump tn = 0.0 fp = numn fn = 0.0 best_mcc = 0.0 best_id = -1 mccs = np.zeros(n) for i in range(n): if y_true_sort[i] == 1: tp -= 1.0 fn += 1.0 else: fp -= 1.0 tn += 1.0 new_mcc = mcc(tp, tn, fp, fn) mccs[i] = new_mcc if new_mcc >= best_mcc: best_mcc = new_mcc best_id = i if show: best_proba = y_prob[idx[best_id]] y_pred = (y_prob > best_proba).astype(int) return best_proba, best_mcc, y_pred else: return best_mcc def mcc_eval(y_prob, dtrain): y_true = dtrain.get_label() best_mcc = eval_mcc(y_true, y_prob) return 'MCC', best_mcc def create_feature_map(features): outfile = open('xgb.fmap', 'w') for i, feat in enumerate(features): outfile.write('{0}\\t{1}\\tq\ '.format(i, feat)) outfile.close() def get_importance(gbm, features): create_feature_map(features) importance = gbm.get_fscore(fmap='xgb.fmap') importance = sorted(importance.items(), key=itemgetter(1), reverse=True) return importance def LeaveOneOut(data1, data2, columnName, useLOO=False): grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index() grpCount = data1.groupby(columnName)['Response'].count().reset_index() grpOutcomes['cnt'] = grpCount.Response if(useLOO): grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1] grpOutcomes.drop('cnt', inplace=True, axis=1) outcomes = data2['Response'].values x = pd.merge(data2[[columnName, 'Response']], grpOutcomes, suffixes=('x_', ''), how='left', on=columnName, left_index=True)['Response'] if(useLOO): x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1) # x = x + np.random.normal(0, .01, x.shape[0]) return x.fillna(x.mean()) def GrabData(): directory = '../input/' trainfiles = ['train_categorical.csv', 'train_date.csv', 'train_numeric.csv'] testfiles = ['test_categorical.csv', 'test_date.csv', 'test_numeric.csv'] cols = [['Id', 'L1_S24_F1559', 'L3_S32_F3851', 'L1_S24_F1827', 'L1_S24_F1582', 'L3_S32_F3854', 'L1_S24_F1510', 'L1_S24_F1525', 'L2_S26_F3099'], ['Id', 'L3_S30_D3496', 'L3_S30_D3506', 'L3_S30_D3501', 'L3_S30_D3516', 'L3_S30_D3511', 'L3_S32_D3852', 'L3_S33_D3858', 'L3_S34_D3875', 'L3_S29_D3316'], ['Id', 'L1_S24_F1846', 'L3_S32_F3850', 'L1_S24_F1695', 'L1_S24_F1632', 'L3_S33_F3855', 'L1_S24_F1604', 'L3_S29_F3407', 'L3_S33_F3865', 'L3_S38_F3952', 'L1_S24_F1723', 'L3_S33_F3861', 'L3_S33_F3857', 'L3_S33_F3859', 'L3_S34_F3876', 'L3_S29_F3461', 'Response']] traindata = None testdata = None for i, f in enumerate(trainfiles): print(f) subset = None for i, chunk in 
enumerate(pd.read_csv(directory + f, usecols=cols[i], chunksize=50000, low_memory=False)): print(i) if subset is None: subset = chunk.copy() else: subset = pd.concat([subset, chunk]) del chunk gc.collect() if traindata is None: traindata = subset.copy() else: traindata = pd.merge(traindata, subset.copy(), on=""Id"") del subset gc.collect() del cols[2][-1] # Test doesn't have response! for i, f in enumerate(testfiles): print(f) subset = None for i, chunk in enumerate(pd.read_csv(directory + f, usecols=cols[i], chunksize=50000, low_memory=False)): print(i) if subset is None: subset = chunk.copy() else: subset = pd.concat([subset, chunk]) del chunk gc.collect() if testdata is None: testdata = subset.copy() else: testdata = pd.merge(testdata, subset.copy(), on=""Id"") del subset gc.collect() traindata = traindata.merge(df_mindate, on='Id') testdata = testdata.merge(df_mindate, on='Id') testdata['Response'] = 0 # Add Dummy Value visibletraindata = traindata[::2] blindtraindata = traindata[1::2] print(blindtraindata.columns) for i in range(2): for col in cols[i][1:]: print(col) blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata, blindtraindata, col, False).values testdata.loc[:, col] = LeaveOneOut(visibletraindata, testdata, col, False).values del visibletraindata gc.collect() testdata.drop('Response', inplace=True, axis=1) return blindtraindata, testdata def Train(): train, test = GrabData() print('Train:', train.shape) print('Test', test.shape) features = list(train.columns) features.remove('Response') features.remove('Id') print(features) num_rounds = 50 params = {} params['objective'] = ""binary:logistic"" params['eta'] = 0.021 params['max_depth'] = 7 params['colsample_bytree'] = 0.82 params['min_child_weight'] = 3 params['base_score'] = 0.005 params['silent'] = True print('Fitting') trainpredictions = None testpredictions = None dvisibletrain = \\ xgb.DMatrix(train[features], train.Response, silent=True) dtest = \\ xgb.DMatrix(test[features], silent=True) folds = 1 for i in range(folds): print('Fold:', i) params['seed'] = i watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')] clf = xgb.train(params, dvisibletrain, num_boost_round=num_rounds, evals=watchlist, early_stopping_rounds=20, feval=mcc_eval, maximize=True ) limit = clf.best_iteration+1 # limit = clf.best_ntree_limit predictions = \\ clf.predict(dvisibletrain, ntree_limit=limit) best_proba, best_mcc, y_pred = eval_mcc(train.Response, predictions, True) print('tree limit:', limit) print('mcc:', best_mcc) print(matthews_corrcoef(train.Response, y_pred)) if(trainpredictions is None): trainpredictions = predictions else: trainpredictions += predictions predictions = clf.predict(dtest, ntree_limit=limit) if(testpredictions is None): testpredictions = predictions else: testpredictions += predictions imp = get_importance(clf, features) print('Importance array: ', imp) best_proba, best_mcc, y_pred = eval_mcc(train.Response, trainpredictions/folds, True) print(matthews_corrcoef(train.Response, y_pred)) submission = pd.DataFrame({""Id"": train.Id, ""Prediction"": trainpredictions/folds, ""Response"": train.Response}) submission[['Id', 'Prediction', 'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv', index=False) submission = pd.DataFrame({""Id"": test.Id.values, ""Response"": testpredictions/folds}) submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv', index=False) y_pred = (testpredictions/folds > .22).astype(int) submission = pd.DataFrame({""Id"": test.Id.values, ""Response"": y_pred}) 
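# Note: the binary labels written below use a hard-coded 0.22 probability cutoff rather than the
# MCC-optimal threshold (best_proba) that eval_mcc found on the training predictions above.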
submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv', index=False) if __name__ == ""__main__"": print('Started') Train() print('Finished') '",No,5,53.0 "date = pd.read_csv('../input/bosch-production-line-performance/train_date.csv.zip', nrows=10000) numeric = pd.read_csv('../input/bosch-production-line-performance/train_numeric.csv.zip', nrows=10000) category = pd.read_csv('../input/bosch-production-line-performance/train_categorical.csv.zip', nrows=10000)",No,5,45.0 date,No,3,41.0 numeric,No,3,41.0 category,No,3,41.0 "num_feats = ['Id', 'L3_S30_F3514', 'L0_S9_F200', 'L3_S29_F3430', 'L0_S11_F314', 'L0_S0_F18', 'L3_S35_F3896', 'L0_S12_F350', 'L3_S36_F3918', 'L0_S0_F20', 'L3_S30_F3684', 'L1_S24_F1632', 'L0_S2_F48', 'L3_S29_F3345', 'L0_S18_F449', 'L0_S21_F497', 'L3_S29_F3433', 'L3_S30_F3764', 'L0_S1_F24', 'L3_S30_F3554', 'L0_S11_F322', 'L3_S30_F3564', 'L3_S29_F3327', 'L0_S2_F36', 'L0_S9_F180', 'L3_S33_F3855', 'L0_S0_F4', 'L0_S21_F477', 'L0_S5_F114', 'L0_S6_F122', 'L1_S24_F1122', 'L0_S9_F165', 'L0_S18_F439', 'L1_S24_F1490', 'L0_S6_F132', 'L3_S29_F3379', 'L3_S29_F3336', 'L0_S3_F80', 'L3_S30_F3749', 'L1_S24_F1763', 'L0_S10_F219', 'Response']",No,4,77.0 "minmaxfeatures.sort_values(by=['mindate', 'Id'], inplace=True) minmaxfeatures['min_Id_rev'] = -minmaxfeatures.Id.diff().shift(-1) minmaxfeatures['min_Id'] = minmaxfeatures.Id.diff()",No,2,12.0 "cols = [['Id']+date_cols,num_feats]",No,5,77.0 "traindata = None testdata = None",No,5,77.0 "trainfiles = ['train_date.csv.zip','train_numeric.csv.zip'] testfiles = ['test_date.csv.zip','test_numeric.csv.zip']",No,5,77.0 "traindata = traindata.merge(minmaxfeatures, on='Id') traindata = traindata.merge(data, on='Id') testdata = testdata.merge(minmaxfeatures, on='Id') testdata = testdata.merge(data, on='Id')",No,5,32.0 "del minmaxfeatures,data gc.collect()",Yes,4,10.0 "train = traindata[::2] valid = traindata[1::2]",No,5,13.0 "del traindata gc.collect()",Yes,4,10.0 "def mcc(tp, tn, fp, fn): num = tp * tn - fp * fn den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) if den == 0: return 0 else: return num / np.sqrt(den)",No,4,49.0 "def eval_mcc(y_true, y_prob): idx = np.argsort(y_prob) y_true_sort = y_true[idx] n = y_true.shape[0] nump = 1.0 * np.sum(y_true) numn = n - nump tp,fp = nump,numn tn,fn = 0.0,0.0 best_mcc = 0.0 best_id = -1 mccs = np.zeros(n) for i in range(n): if y_true_sort[i] == 1: tp -= 1.0 fn += 1.0 else: fp -= 1.0 tn += 1.0 new_mcc = mcc(tp, tn, fp, fn) mccs[i] = new_mcc if new_mcc >= best_mcc: best_mcc = new_mcc best_id = i return best_mcc",No,3,49.0 "def mcc_eval(y_prob, dtrain): y_true = dtrain.get_label() best_mcc = eval_mcc(y_true, y_prob) return 'MCC', best_mcc",No,3,49.0 "import xgboost as xgb params = {'objective':""binary:logistic"", 'max_depth':25, 'base_score':0.005, 'eval_metric':'auc', 'n_jobs':-1 }'",Yes,5,59.0 "trainm = xgb.DMatrix(train.drop(['Response','Id'],axis=1),train['Response']) validm = xgb.DMatrix(valid.drop(['Response','Id'],axis=1),valid['Response']) test = xgb.DMatrix(testdata.drop(['Id'],axis=1))",No,5,21.0 "watchlist = [(trainm, 'train'), (validm, 'val')] clf = xgb.train(params, trainm, num_boost_round=100, evals=watchlist, early_stopping_rounds=20, feval=mcc_eval, maximize=True )",Yes,5,7.0 predictions = clf.predict(validm),No,5,48.0 "fig, ax = plt.subplots(figsize=(12,18)) xgb.plot_importance(clf,ax=ax)",No,5,79.0 test = clf.predict(test),No,5,48.0 "testdata['Response'] = (test>best_prob).astype(int) testdata[['Id','Response']].to_csv(""submitwoId.csv"",index=False)'",Yes,5,25.0 "import numpy 
as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import tqdm import gc import sys import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = data[['Id','start_station','end_station']] usefuldatefeatures = ['Id']+date_cols",No,4,77.0 "minmaxfeatures = None for chunk in pd.read_csv('../input/bosch-production-line-performance/train_date.csv.zip',usecols=usefuldatefeatures,chunksize=50000,low_memory=False): features = chunk.columns.values.tolist() features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values df_mindate_chunk['min_time_station'] = chunk[features].idxmin(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) df_mindate_chunk['max_time_station'] = chunk[features].idxmax(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) minmaxfeatures = pd.concat([minmaxfeatures, df_mindate_chunk]) del chunk gc.collect()",Yes,3,8.0 "for chunk in pd.read_csv('../input/bosch-production-line-performance/test_date.csv.zip',usecols=usefuldatefeatures,chunksize=50000,low_memory=False): features = chunk.columns.values.tolist() features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values df_mindate_chunk['min_time_station'] = chunk[features].idxmin(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) df_mindate_chunk['max_time_station'] = chunk[features].idxmax(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) minmaxfeatures = pd.concat([minmaxfeatures, df_mindate_chunk]) del chunk gc.collect()",Yes,3,8.0 traindata,No,5,41.0 testdata,No,5,41.0 "traindata.fillna(value=0,inplace=True) testdata.fillna(value=0,inplace=True)",No,5,17.0 "model = RandomForestClassifier(n_estimators=500,n_jobs=-1,verbose=1,random_state=11) model.fit(total.drop(['Response','Id'],axis=1),total['Response'])",Yes,5,7.0 "test = model.predict(testdata.drop(['Id'],axis=1))",No,5,48.0 "testdata['Response'] = test testdata[['Id','Response']].to_csv(""submit.csv"",index=False)'",No,5,25.0 total,No,5,41.0 "import numpy as np from sklearn.ensemble import RandomForestClassifier import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from sklearn import preprocessing import os print(os.listdir(""../input"")) from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn import tree from sklearn import svm from sklearn.ensemble import AdaBoostClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn import ensemble from sklearn.metrics import accuracy_score",Yes,5,88.0 "%%time event_type=pd.read_csv(""../input/event_type.csv"",error_bad_lines=False) train = pd.read_csv(""../input/train.csv"") severity_type = pd.read_csv(""../input/severity_type.csv"") log_feature = pd.read_csv(""../input/log_feature.csv"") test = pd.read_csv(""../input/test.csv"") resource_type = pd.read_csv(""../input/resource_type.csv"",error_bad_lines=False) sample_submission = pd.read_csv(""../input/sample_submission.csv"")",No,5,45.0 "print(""test"",test.shape) print(""train"",train.shape)",No,5,58.0 "print('test',test.head()) print('train',train.head(4)) print('sample_submission',sample_submission.head()) print('event_type',event_type.shape,event_type.head(2)) print('severity_type',severity_type.shape,severity_type.head(2)) print('log_feature',log_feature.shape,log_feature.head(2)) print('resource_type',resource_type.shape,resource_type.head(2))",No,5,41.0 "event_type['id']=pd.to_numeric(event_type['id'],errors='coerce') #converting object datatype into numeric",No,5,16.0 event_type.dtypes,No,5,70.0 "def merge_fn(df1,df2,col_name,how_param): merged_df=df1.merge(df2,how=how_param,on=col_name) return merged_df ",No,5,32.0 "train_merge1=merge_fn(train,event_type.drop_duplicates(subset=['id']),'id','left') train_merge2=merge_fn(train_merge1,severity_type.drop_duplicates(subset=['id']),'id','left') train_merge3=merge_fn(train_merge2,log_feature.drop_duplicates(subset=['id']),'id','left') train_merge4=merge_fn(train_merge3,resource_type.drop_duplicates(subset=['id']),'id','left')",No,5,32.0 train_merge4.shape,No,5,58.0 train_merge4.head(),No,5,41.0 train_merge4.dtypes,No,5,70.0 train_merge4.isnull().sum(),No,5,39.0 cat_col=list(set(train_merge4.columns)-set(train_merge4._get_numeric_data().columns)),No,5,77.0 "train_merge4=categorical_conversion(train_merge4,cat_col) ",No,5,16.0 "def label_encoding_conversion(df,cat_col): le=preprocessing.LabelEncoder() for i in range(len(cat_col)): df[cat_col[i]]=le.fit_transform(df[cat_col[i]]) return df",No,5,20.0 train_merge4.columns,No,5,71.0 "train_merge4=label_encoding_conversion(train_merge4,cat_col)",No,5,20.0 "train_merge4.drop(['id'],axis=1,inplace=True)",No,5,10.0 target=train_merge4[['fault_severity']],No,5,21.0 "train_merge4.drop(['fault_severity'],axis=1,inplace=True)",No,5,10.0 "test_merge1=merge_fn(test,event_type.drop_duplicates(subset=['id']),'id','left') test_merge2=merge_fn(test_merge1,severity_type.drop_duplicates(subset=['id']),'id','left') test_merge3=merge_fn(test_merge2,log_feature.drop_duplicates(subset=['id']),'id','left') test_merge4=merge_fn(test_merge3,resource_type.drop_duplicates(subset=['id']),'id','left')",No,5,32.0 test_merge4.shape,No,5,58.0 severity_type.head(),No,5,41.0 test_merge4.head(2),No,5,41.0 cat_col,No,5,53.0 "test_merge4=label_encoding_conversion(test_merge4,cat_col)",No,5,20.0 test_merge4.dtypes,No,5,70.0 "test_merge4.drop(['id'],axis=1,inplace=True)",No,5,10.0 test_merge4.columns,No,5,71.0 "lr=LogisticRegression() lr.fit(train_merge4,target) lr_pred=lr.predict(test_merge4) accuracy_score(pd.DataFrame(lr.predict(train_merge4)),target)",No,3,7.0 "rf=RandomForestClassifier() 
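# The accuracy_score calls in these cells score the same rows used for fitting, so they are training
# scores rather than generalisation estimates; a held-out split scored with the competition's
# multi-class log loss is more informative. A minimal sketch (illustrative names, kept commented out):
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import log_loss
# X_tr, X_va, y_tr, y_va = train_test_split(train_merge4, target, test_size=0.2, random_state=0)
# rf_holdout = RandomForestClassifier().fit(X_tr, y_tr.values.ravel())
# print(log_loss(y_va.values.ravel(), rf_holdout.predict_proba(X_va)))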
rf.fit(train_merge4,target) rf_pred=rf.predict(test_merge4) accuracy_score(pd.DataFrame(rf.predict(train_merge4)),target)",No,3,71.0 " nb=GaussianNB() nb.fit(train_merge4,target) nb.predict(test_merge4) accuracy_score(pd.DataFrame(nb.predict(train_merge4)),target)",Yes,3,7.0 " dt=tree.DecisionTreeClassifier() dt.fit(train_merge4,target) dt.predict(test_merge4) accuracy_score(pd.DataFrame(dt.predict(train_merge4)),target)",Yes,3,7.0 " svc_ml=svm.SVC() svc_ml.fit(train_merge4,target) svc_ml.predict(test_merge4) accuracy_score(pd.DataFrame(svc_ml.predict(train_merge4)),target)",Yes,3,7.0 " ada=AdaBoostClassifier() ada.fit(train_merge4,target) ada.predict(test_merge4) accuracy_score(pd.DataFrame(ada.predict(train_merge4)),target)",Yes,3,7.0 " knn=KNeighborsClassifier() knn.fit(train_merge4,target) knn.predict(test_merge4) accuracy_score(pd.DataFrame(knn.predict(train_merge4)),target)",Yes,3,7.0 " gb=ensemble.GradientBoostingClassifier() gb.fit(train_merge4,target) gb_pre=gb.predict(test_merge4) accuracy_score(pd.DataFrame(gb.predict(train_merge4)),target)",Yes,3,7.0 "list1=[] tuple_l=() def data_modeling(X,target,model): for i in range(len(model)): ml=model[i] ml.fit(X,target) pred=ml.predict(X) acc_score=accuracy_score(pd.DataFrame(ml.predict(X)),target) tuple_l=(ml.__class__.__name__,acc_score) list1.append(tuple_l) print(tuple_l) return list1 model_score_output=data_modeling(train_merge4,target,[AdaBoostClassifier(),KNeighborsClassifier(), svm.SVC(),RandomForestClassifier(), tree.DecisionTreeClassifier(), GaussianNB(), LogisticRegression(), ensemble.GradientBoostingClassifier()])",Yes,2,7.0 "modelscore_df=pd.DataFrame(model_score_output,columns=['Classifier',""Accuracy score""])'",Yes,5,12.0 modelscore_df,Yes,5,41.0 modelscore_df['classifier code']=np.arange(8),No,5,8.0 "modelscore_df.plot.bar(x='classifier code', y='Accuracy score', rot=0)",No,3,12.0 "predict_test=rf.predict_proba(test_merge4) pred_df=pd.DataFrame(predict_test,columns=['predict_0', 'predict_1', 'predict_2']) submission=pd.concat([test[['id']],pred_df],axis=1) submission.to_csv('sub.csv',index=False,header=True) ",No,4,25.0 "import pandas as pd train = pd.read_csv(""../input/train_users.csv"") test = pd.read_csv(""../input/test_users.csv"") sessions = pd.read_csv(""../input/sessions.csv"")",No,5,45.0 "for data in (train, test): data['year_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[:4]) ) data['month_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[5:7])) data['week_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[8:10])) data['year_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[:4])) data['month_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[5:7])) data['week_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[8:10])) ",No,5,8.0 "for data in (train, test): data.drop(['date_account_created'], axis=1,inplace=True) data.drop(['date_first_booking'], axis=1,inplace=True) train.head()",Yes,5,10.0 "train = pd.merge(train, sessions, how=""left"", left_on=[""id""], right_on=[""user_id""]) test = pd.merge(test, sessions, how=""left"", left_on=[""id""], right_on=[""user_id""])",No,5,32.0 "countries = pd.read_csv(""../input/countries.csv"") countries.head()",Yes,4,45.0 "ga_train['trainrow'] = np.arange(ga_train.shape[0]) ga_test['testrow'] = 
np.arange(ga_test.shape[0])",No,4,8.0 
"# encode phone_brand as integers with LabelEncoder (transform / inverse_transform map between codes and brand names) brand_encoder = LabelEncoder().fit(device_brand['phone_brand']) device_brand['brand'] = brand_encoder.transform(device_brand['phone_brand']) ga_train['brand'] = device_brand['brand'] # aligned on the device_id index ga_test['brand'] = device_brand['brand'] # sparse one-hot: one row per device (trainrow/testrow), one column per brand, a 1 marking each device's brand Xtr_brand = csr_matrix((np.ones(ga_train.shape[0]), (ga_train['trainrow'], ga_train['brand']))) Xte_brand = csr_matrix((np.ones(ga_test.shape[0]), (ga_test['testrow'], ga_test['brand']))) print(Xtr_brand.shape, Xte_brand.shape)",Yes,4,20.0 
"m = device_brand.phone_brand.str.cat(device_brand.device_model) modelencoder = LabelEncoder().fit(m) device_brand['model'] = modelencoder.transform(m) ga_train['model'] = device_brand['model'] ga_test['model'] = device_brand['model'] Xtr_model = csr_matrix((np.ones(ga_train.shape[0]), (ga_train.trainrow, ga_train.model))) Xte_model = csr_matrix((np.ones(ga_test.shape[0]), (ga_test.testrow, ga_test.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",Yes,4,20.0 
"# encode app_id as consecutive integer codes app_encoder = LabelEncoder().fit(app_events['app_id']) app_events['app'] = app_encoder.transform(app_events['app_id'])",No,5,20.0 
"napps = len(app_encoder.classes_) # bag-of-apps: one row per device (trainrow/testrow), one column per app, a 1 where the device has that app d = device_apps.dropna(subset=['trainrow']) # devices not in the train set have NaN trainrow Xtr_app = csr_matrix((np.ones(d.shape[0]), (d['trainrow'], d['app'])), shape=[ga_train.shape[0],napps]) d = device_apps.dropna(subset=['testrow']) Xte_app = csr_matrix((np.ones(d.shape[0]), (d['testrow'], d['app'])), shape=[ga_test.shape[0],napps]) # same number of rows as the brand/model matrices print(Xtr_app.shape, Xte_app.shape)",No,3,17.0 
"# keep only the apps that appear in app_events so app_encoder can encode them app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())] app_labels['app'] = app_encoder.transform(app_labels['app_id']) # encode label_id the same way label_encoder = LabelEncoder().fit(app_labels['label_id']) app_labels['label'] = label_encoder.transform(app_labels['label_id'])",No,4,20.0 
"nlabels = len(label_encoder.classes_) # pass an explicit shape to csr_matrix so train and test get the same number of label columns d = device_labels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(ga_train.shape[0],nlabels)) d = device_labels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(ga_test.shape[0],nlabels)) print(Xtr_label.shape, Xte_label.shape)",No,3,17.0 
"Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print(Xtrain.shape, Xtest.shape)",No,4,11.0 
"target_encoder = LabelEncoder().fit(ga_train['group']) y = target_encoder.transform(ga_train['group']) nclasses = len(target_encoder.classes_) #app_labels",No,5,20.0 
"from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier rdf = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)",No,5,4.0 
"rdf.fit(Xtrain[:70000], y[:70000]) pred = rdf.predict_proba(Xtrain[70001:]) log_loss(y[70001:], pred)",No,3,28.0 
"pred_rdf = rdf.predict(Xtrain[70001:]) np.mean(pred_rdf==y[70001:])",No,4,28.0 
"pred = pd.DataFrame(rdf.predict_proba(Xtrain[70001:]), index=ga_train.iloc[70001:].index, columns=target_encoder.classes_) pred.head()",No,3,12.0 
"predgroup = pd.DataFrame(y[70001:], index=ga_train.iloc[70001:].index) predgroup.head()",No,4,12.0 
"files = [ 'countries', 'age_gender_bkts', 
'test_users', 'train_users', 'sessions' ] data = {} for f in files: data[f] = pd.read_csv('../input/' + f + '.csv')",No,4,45.0 "results.to_csv('dummy_results.csv', index=False)",No,5,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn import cross_validation import xgboost as xgb ### set-up some custom functions for below def get_year(date): if date == date: return int(str(date)[:4]) return date def get_month(date): if date == date: return int(str(date)[5:7]) return date def language_bucket(dataset): if dataset['language'] == 'en': val = 'en' else: val = 'non-en' return val ",No,3,22.0 "#### import the data train_users = pd.read_csv('../input/train_users.csv') test_users = pd.read_csv('../input/test_users.csv') gender = pd.read_csv('../input/age_gender_bkts.csv') sessions = pd.read_csv('../input/sessions.csv') countries = pd.read_csv('../input/countries.csv') all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) ",No,4,45.0 "##### the age variable has a few missing values.. let's go ahead and put the average in for these average_age = train_users[""age""].mean() train_users[""age""][np.isnan(train_users[""age""])] = average_age test_users[""age""][np.isnan(test_users[""age""])] = average_age'",No,5,17.0 "#### looking at the country distribution through a couple of variables fig, (axis1, axis2, axis3, axis4, axis5, axis6) = plt.subplots(6,1,figsize=(15,30)) sns.countplot(x='country_destination', data=train_users, palette=""husl"", ax=axis1) sns.countplot(x='signup_flow', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis2) sns.countplot(x='affiliate_channel', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis3) sns.countplot(x='age_range', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis4) sns.countplot(x='signup_year', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis5) sns.countplot(x='language_bucket', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis6) '",No,5,33.0 "######## need to change the format of our variables so we can use the algo # signup_method train_users[""signup_method""] = (train_users[""signup_method""] == ""basic"").astype(int) test_users[""signup_method""] = (test_users[""signup_method""] == ""basic"").astype(int) # signup_flow train_users[""signup_flow""] = (train_users[""signup_flow""] == 3).astype(int) test_users[""signup_flow""] = (test_users[""signup_flow""] == 3).astype(int) # language train_users[""language""] = (train_users[""language""] == 'en').astype(int) test_users[""language""] = (test_users[""language""] == 'en').astype(int) # affiliate_channel train_users[""affiliate_channel""] = (train_users[""affiliate_channel""] == 'direct').astype(int) test_users[""affiliate_channel""] = (test_users[""affiliate_channel""] == 'direct').astype(int) # affiliate_provider train_users[""affiliate_provider""] = (train_users[""affiliate_provider""] == 'direct').astype(int) test_users[""affiliate_provider""] = (test_users[""affiliate_provider""] == 'direct').astype(int)'",No,5,16.0 "#### clense 
the data of non-numeric values from sklearn import preprocessing for f in train_users.columns: if f == ""country_destination"" or f == ""id"": continue if train_users[f].dtype == 'object': lbl = preprocessing.LabelEncoder() lbl.fit(np.unique(list(train_users[f].values) + list(test_users[f].values))) train_users[f] = lbl.transform(list(train_users[f].values)) test_users[f] = lbl.transform(list(test_users[f].values)) ##In'",No,5,20.0 "# define training and testing sets X_train = train_users.drop([""country_destination"", ""id"", 'booked', 'age_range'],axis=1) Y_train = train_users[""country_destination""] X_test = test_users.drop(['id', 'age_range'],axis=1).copy() ##In'",No,5,21.0 "# modify country_destination to numerical values country_num_dic = {'NDF': 0, 'US': 1, 'other': 2, 'FR': 3, 'IT': 4, 'GB': 5, 'ES': 6, 'CA': 7, 'DE': 8, 'NL': 9, 'AU': 10, 'PT': 11} num_country_dic = {y:x for x,y in country_num_dic.items()} Y_train = Y_train.map(country_num_dic)",No,5,20.0 "### Xgboost params = {""objective"": ""multi:softmax"", ""num_class"": 12} T_train_xgb = xgb.DMatrix(X_train, Y_train) X_test_xgb = xgb.DMatrix(X_test) gbm = xgb.train(params, T_train_xgb, 20) Y_pred = gbm.predict(X_test_xgb)",No,3,20.0 "# Create submission country_df = pd.DataFrame({ ""id"": test_users[""id""], ""country"": Y_pred }) submission = DataFrame(columns=[""id"", ""country""]) # sort countries according to most probable destination country for key in country_df['country'].value_counts().index: submission = pd.concat([submission, country_df[country_df[""country""] == key]], ignore_index=True) ####submission.to_csv('airbnb.csv', index=False)'",No,5,55.0 "##### add ndf to everyone ndf_only = pd.DataFrame(test_users['id']) ndf_only['country'] = 'NDF' ##submission_final = pd.concat([submission, ndf_only]) ndf_only.to_csv('airbnb.csv', index=False)",No,4,25.0 "###### uh are the previous submissions formatted incorrectly or something? 
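###### note: this competition is scored with NDCG@5, so a submission lists up to five candidate
###### countries per user id in descending order of confidence; the loop below writes exactly five
###### rows per id, leading with NDF for users whose date_first_booking is missing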
######## checking via baseline submission result = [] for index, row in test_users.iterrows(): if isinstance(row['date_first_booking'], float): result.append([row['id'], 'NDF']) result.append([row['id'], 'US']) result.append([row['id'], 'other']) result.append([row['id'], 'FR']) result.append([row['id'], 'IT']) else: result.append([row['id'], 'US']) result.append([row['id'], 'other']) result.append([row['id'], 'FR']) result.append([row['id'], 'IT']) result.append([row['id'], 'GB']) pd.DataFrame(result).to_csv('sub.csv', index = False, header = ['id', 'country'])",No,4,55.0 "##result results = pd.DataFrame(result) results.columns = ['id', 'country'] results[results['id'] == 'qe9gwamyfk']",No,5,55.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn, sklearn import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.ensemble import RandomForestClassifier sns.set_style('whitegrid') ##%matplotlib inline #### import the data train_users = pd.read_csv('../input/train_users_2.csv') test_users = pd.read_csv('../input/test_users.csv') gender = pd.read_csv('../input/age_gender_bkts.csv') sessions = pd.read_csv('../input/sessions.csv') countries = pd.read_csv('../input/countries.csv') ##all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) mobile_browsers = [] for x in train_users['first_browser'].unique(): if 'Mobile' in x: mobile_browsers.append(x) else: pass major_browsers = ['IE', 'Safari', '-unknown- ', 'Chrome', 'Firefox', 'Mobile'] ### group up those first_browsers train_users['first_browser_grouped'] = np.where(train_users['first_browser'].isin(mobile_browsers), 'Mobile', train_users['first_browser']) train_users['first_browser_grouped'] = np.where(train_users['first_browser_grouped'].isin(major_browsers), train_users['first_browser_grouped'], 'Other') ### find year of account creation #train_users['year_account_creation'] = pd.DatetimeIndex(train_users['date_account_created']).year ### group up the first_device_type dict_first_device_type = {""Mac Desktop"": ""Desktop"", ""Windows Desktop"": ""Desktop"", ""Desktop (Other)"": ""Desktop"", ""iPhone"": ""Phone/Pad"", ""iPad"": ""Phone/Pad"", ""Android Tablet"": ""Phone/Pad"", ""Android Phone"": ""Phone/Pad"", ""SmartPhone (Other)"": ""Phone/Pad""} train_users = train_users.replace({""first_device_type"": dict_first_device_type}) ######### apply the above adjustments to the test dataset test_users['first_browser_grouped'] = np.where(test_users['first_browser'].isin(mobile_browsers), 'Mobile', test_users['first_browser']) test_users['first_browser_grouped'] = np.where(test_users['first_browser_grouped'].isin(major_browsers), test_users['first_browser_grouped'], 'Other') #test_users['year_account_creation'] = pd.DatetimeIndex(test_users['date_account_created']).year test_users = test_users.replace({""first_device_type"": dict_first_device_type}) '",No,3,22.0 "language_distance = {'language' : ['en', 'du', 'fr', 'es'], 'levenshtein_distance_from_en' : [0, 72.61, 92.06, 92.25]} language_distance = pd.DataFrame(language_distance) train_users = pd.merge(train_users, language_distance, on = 'language', how = 'left') test_users = pd.merge(test_users, language_distance, on = 'language', how = 'left') ",No,5,32.0 " ########## fill in the missing values train_users['levenshtein_distance_from_en'].fillna(-1) test_users['levenshtein_distance_from_en'].fillna(-1)",No,5,17.0 "##train_users['year_account_creation'] = 
pd.DatetimeIndex(train_users['date_account_created']).year train_users['timestamp_first_active'] = train_users['timestamp_first_active'].astype(str) train_users['date_account_created'] = pd.to_datetime(train_users['date_account_created']) #### converting the first active day to a date-time var train_users['timestamp_first_active_day'] = train_users['timestamp_first_active'].str[:8] train_users['timestamp_first_active_day'] = pd.to_datetime(train_users['timestamp_first_active_day'], format='%Y%m%d') #### find the first active year train_users['timestamp_first_active_year'] = train_users['timestamp_first_active'].str[:4] train_users['timestamp_first_active_hour'] = train_users['timestamp_first_active'].str[8:10] #### create a var to see if they searched before joining #train_users['searched_before_joining'] = (train_users['timestamp_first_active_day'] < train_users['date_account_created']) #train_users['searched_before_joining'] = train_users['searched_before_joining'] * 1 #### did they do a previous trip? This appears to be a weird variable.. ##train_users['first_trip'] = pd.isnull(train_users['date_first_booking']) * 1 major_languages = ['en'] train_users['language_bucket'] = np.where(train_users['language'].isin(major_languages), 'en', 'other') ##### group up the age variable labels = [1, 2, 3, 4, 5, 6, 7] bins = [0, 20, 30, 40, 50, 60, 9000, 100000] train_users['age'].fillna(10000) train_users['age_group'] = pd.cut(train_users['age'], bins, right=False, labels=labels) train_users['age_group'] = train_users['age_group'] * 1 train_users[""signup_combo""] = train_users[""signup_method""].map(str) + train_users[""signup_flow""].map(str) ##### let's group the affiliate_provider variable major_affiliate_providers = ['direct', 'google', 'bing', 'craigslist', 'facebook'] train_users['affiliate_provider_grp'] = np.where(train_users['affiliate_provider'].isin(major_affiliate_providers), train_users['affiliate_provider'], 'other') train_users[""affiliate_combined""] = train_users[""affiliate_provider_grp""].map(str) + train_users[""affiliate_channel""].map(str) ###### adjust test so it matches the adjustments made to the train dataset test_users['timestamp_first_active'] = test_users['timestamp_first_active'].astype(str) test_users['date_account_created'] = pd.to_datetime(test_users['date_account_created']) test_users['timestamp_first_active_day'] = test_users['timestamp_first_active'].str[:8] test_users['timestamp_first_active_day'] = pd.to_datetime(test_users['timestamp_first_active_day'], format='%Y%m%d') test_users['timestamp_first_active_year'] = test_users['timestamp_first_active'].str[:4] #test_users['searched_before_joining'] = (test_users['timestamp_first_active_day'] < test_users['date_account_created']) #test_users['searched_before_joining'] = test_users['searched_before_joining'] * 1 ##test_users['first_trip'] = pd.isnull(test_users['date_first_booking']) * 1 test_users['language_bucket'] = np.where(test_users['language'].isin(major_languages), 'en', 'other') test_users['age'].fillna(10000) test_users['age_group'] = pd.cut(test_users['age'], bins, right=False, labels=labels) test_users['age_group'] = test_users['age_group'] * 1 test_users['timestamp_first_active_day'] = pd.to_datetime(test_users['timestamp_first_active_day'], format='%Y%m%d') test_users[""signup_combo""] = test_users[""signup_method""].map(str) + test_users[""signup_flow""].map(str) test_users['timestamp_first_active_hour'] = test_users['timestamp_first_active'].str[8:10] test_users['affiliate_provider_grp'] = 
np.where(test_users['affiliate_provider'].isin(major_affiliate_providers), test_users['affiliate_provider'], 'other') test_users[""affiliate_combined""] = test_users[""affiliate_provider_grp""].map(str) + test_users[""affiliate_channel""].map(str) '",No,3,8.0 "#np.any(np.isnan(train_users['id'])) #print(np.all(np.isfinite(col))) #np.isnan(train_users.any()) #np.isfinite(train_users.any()) #np.isnan(test_users.any()) #np.isfinite(test_users.any()) #train_users.head() X_train = train_users_imputed.drop(['signup_app', 'affiliate_provider', 'affiliate_channel', 'levenshtein_distance_from_en', 'month_year_first_active', 'month_year_created', 'year_first_active', 'timestamp_first_active_year', 'country_destination', 'id', 'first_browser', 'age', 'language'], axis=1) y_train = train_users_imputed['country_destination'] X_test = test_users_imputed.drop(['signup_app', 'affiliate_provider', 'affiliate_channel', 'levenshtein_distance_from_en', 'month_year_first_active', 'month_year_created', 'year_first_active', 'timestamp_first_active_year', 'id', 'age', 'first_browser', 'language'], axis = 1) ",No,3,21.0 " ###### look at variable importance in the model importances = clf.feature_importances_ indices = np.argsort(importances)[::-1] # Print the feature ranking print(""Feature ranking:"") for f in indices: print(X_train.columns[f], importances[f]) #print(""%d. feature %d (%f)"" % (f + 1, indices[f], importances[indices[f]])) #for x in range(X_train.shape[1]): # print(X_train.columns[x])",No,5,86.0 "### alright, if none of the entries for an id is NDF, then set the 5th obs == NDF no_ndf_ppl = top_5_records[~top_5_records['id'].isin(top_5_records['id'][top_5_records['variable'] == 'NDF'])] ndf_ppl = top_5_records[top_5_records['id'].isin(top_5_records['id'][top_5_records['variable'] == 'NDF'])] no_ndf_ppl_first = no_ndf_ppl.sort(['value'], ascending=[1]) no_ndf_ppl_first_ndf = no_ndf_ppl_first.groupby('id').head(1) no_ndf_ppl_first_ndf['variable'] = 'NDF' no_ndf_ppl_first_4 = no_ndf_ppl.sort(['value'], ascending=[0]) no_ndf_ppl_first_other = no_ndf_ppl_first_4.groupby('id').head(4) ##### combine all of the dataframes together result = pd.concat([no_ndf_ppl_first_ndf, no_ndf_ppl_first_other , ndf_ppl]) result = result.drop(['value'], axis = 1) result.columns = ['id', 'country'] #### create the final output dataframe final_output_adjusted = DataFrame(columns=['id', 'country']) final_output_adjusted = final_output_adjusted.append(result) #### convert to csv final_output_adjusted.to_csv('adjusted.csv', index = False, header = ['id', 'country']) ",No,2,17.0 "#Importing all dependencies import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import train_test_split from sklearn.preprocessing import StandardScaler #algorithms from xgboost.sklearn import XGBClassifier from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier ",No,5,22.0 "#Loading datasets data_dir = ""../input/"" train_users = pd.read_csv(data_dir + 'train_users_2.csv',parse_dates=['date_account_created','timestamp_first_active','date_first_booking'], index_col=False) sessions=pd.read_csv(data_dir + 'sessions.csv') test_users=pd.read_csv(data_dir + 'test_users.csv', parse_dates=['date_account_created','timestamp_first_active','date_first_booking'], index_col=False)'",No,5,45.0 "#Printing the columns 
print(train_users.columns) print(test_users.columns)",No,5,71.0 "#Mark test and train users and merge them for data preparation train_users['type']='Train' test_users['country_destination']='NULL' test_users['type']='Test' users = pd.concat([train_users, test_users], ignore_index=True)",No,5,11.0 "#verify the counts print('# of train users: ',train_users.id.count()) print('# of test users: ',test_users.id.count()) print('# of total users: ',train_users.id.count()+test_users.id.count()) print('# of users: ',users.id.count())",No,5,72.0 "def feature_engineering(data): #Date account created data['Day_Acct_Created'] = data['date_account_created'].dt.day data['Month_Acct_Created'] = data['date_account_created'].dt.month data['Year_Acct_Created'] = data['date_account_created'].dt.year data['Hour_Acct_Created'] = data['date_account_created'].dt.hour data['DayOfWeek_Acct_Created'] = data['date_account_created'].dt.dayofweek data['WeekOfYear_Acct_Created'] = data['date_account_created'].dt.weekofyear #Timestamp of first active data['Day_First_Active'] = data['timestamp_first_active'].dt.day data['Month_First_Active'] = data['timestamp_first_active'].dt.month data['Year_First_Active'] = data['timestamp_first_active'].dt.year data['Hour_First_Active'] = data['timestamp_first_active'].dt.hour data['DayOfWeek_First_Active'] = data['timestamp_first_active'].dt.dayofweek data['WeekOfYear_First_Active'] = data['timestamp_first_active'].dt.weekofyear #Date of first booking data['Day_First_Booking'] = data['date_first_booking'].dt.day data['Month_First_Booking'] = data['date_first_booking'].dt.month data['Year_First_Booking'] = data['date_first_booking'].dt.year data['Hour_First_Booking'] = data['date_first_booking'].dt.hour data['DayOfWeek_First_Booking'] = data['date_first_booking'].dt.dayofweek data['WeekOfYear_First_Booking'] = data['date_first_booking'].dt.weekofyear #Replace unknowns by NA data.gender.replace('-unknown-', np.nan, inplace=True) #Replace Ages data.loc[data.age > 95, 'age'] = np.nan data.loc[data.age < 13, 'age'] = np.nan #Converting categorical to numeric enc = LabelEncoder() #data['gender_cd'] = enc.fit_transform(data['gender']) data['signup_method_cd'] = enc.fit_transform(data['signup_method']) data['language_cd'] = enc.fit_transform(data['language']) data['affiliate_channel_cd'] = enc.fit_transform(data['affiliate_channel']) data['affiliate_provider_cd'] = enc.fit_transform(data['affiliate_provider']) #data['first_affiliate_tracked_cd'] = enc.fit_transform(data['first_affiliate_tracked']) data['signup_app_cd'] = enc.fit_transform(data['signup_app']) data['first_device_type_cd'] = enc.fit_transform(data['first_device_type']) data['first_browser_cd'] = enc.fit_transform(data['first_browser']) #Converting the target variable as it is in category category_encoder = LabelEncoder() category_encoder.fit(data['country_destination']) data['country_destination_cd'] = category_encoder.transform(data['country_destination']) #print(category_encoder.classes_) return data",No,4,8.0 temp=feature_engineering(users),No,5,8.0 "#Manual feature engineering #gender #Converting categorial to numeric temp.gender[temp.gender=='nan']='-1' temp.gender[temp.gender=='MALE']='0' temp.gender[temp.gender=='FEMALE']='1' temp.gender[temp.gender=='OTHER']='2' #first_affiliate_tracked temp.first_affiliate_tracked[temp.first_affiliate_tracked=='nan']='-1' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='untracked']='0' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='omg']='1' 
temp.first_affiliate_tracked[temp.first_affiliate_tracked=='linked']='2' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='tracked-other']='3' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='product']='4' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='marketing']='5' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='local ops']='6' temp = temp.fillna(-1)",No,5,20.0 "#Split train and test sets train=temp[temp['type']=='Train'] test=temp[temp['type']=='Test'] print(train.id.count(),test.id.count())",No,5,13.0 "#Creating train_xx and target_xx train_xx=train[[ 'Day_Acct_Created', 'Month_Acct_Created', 'Year_Acct_Created', 'Hour_Acct_Created', 'DayOfWeek_Acct_Created', 'WeekOfYear_Acct_Created', 'Day_First_Active', 'Month_First_Active', 'Year_First_Active', 'Hour_First_Active', 'DayOfWeek_First_Active', 'WeekOfYear_First_Active', 'Day_First_Booking', 'Month_First_Booking', 'Year_First_Booking', 'Hour_First_Booking', 'DayOfWeek_First_Booking', 'WeekOfYear_First_Booking', 'signup_method_cd', 'language_cd', 'affiliate_channel_cd', 'affiliate_provider_cd', 'signup_app_cd', 'first_device_type_cd', 'first_browser_cd','gender','age']] target_xx=train['country_destination_cd'] predict_xx=test[[ 'Day_Acct_Created', 'Month_Acct_Created', 'Year_Acct_Created', 'Hour_Acct_Created', 'DayOfWeek_Acct_Created', 'WeekOfYear_Acct_Created', 'Day_First_Active', 'Month_First_Active', 'Year_First_Active', 'Hour_First_Active', 'DayOfWeek_First_Active', 'WeekOfYear_First_Active', 'Day_First_Booking', 'Month_First_Booking', 'Year_First_Booking', 'Hour_First_Booking', 'DayOfWeek_First_Booking', 'WeekOfYear_First_Booking', 'signup_method_cd', 'language_cd', 'affiliate_channel_cd', 'affiliate_provider_cd', 'signup_app_cd', 'first_device_type_cd', 'first_browser_cd','gender','age']]",No,5,21.0 target_xx.head(),No,5,41.0 "#Splitting train and test X = train_xx y = target_xx X_test = predict_xx #Classifier xgb = RandomForestClassifier() xgb.fit(X, y) y_pred = xgb.predict_proba(X_test) ",Yes,4,48.0 "test['x']=y_pred output=test[['id','x']] ",No,5,55.0 output.head(),No,5,41.0 "%matplotlib inline import numpy as np import pandas as pd import datetime from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier import matplotlib.pyplot as plt",No,5,23.0 "#Loading data df_train_raw = pd.read_csv('../input/train_users_2.csv') df_test = pd.read_csv('../input/test_users.csv') labels = df_train_raw['country_destination'].values df_train = df_train_raw.drop(['country_destination'], axis=1) id_test = df_test['id'] piv_train = df_train.shape[0]",No,4,45.0 "#Creating a DataFrame with train+test data df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) #Removing id and date_first_booking df_all = df_all.drop(['id', 'date_first_booking'], axis=1) #Filling nan df_all = df_all.fillna(-1) #####Feature engineering####### #date_account_created df_all.date_account_created = pd.to_datetime(df_all.date_account_created) df_all['dac_year'] = df_all.date_account_created.apply(lambda x: x.year) df_all['dac_month'] = df_all.date_account_created.apply(lambda x: x.month) df_all['dac_day'] = df_all.date_account_created.apply(lambda x: x.day) df_all['dac_weekday'] = df_all.date_account_created.apply(lambda x: x.weekday()) df_all['dac_week'] = df_all.date_account_created.apply(lambda x: x.week) df_all['dac_log_elapsed'] = np.log((datetime.date(2016, 1, 1) - df_all.date_account_created).astype('timedelta64[D]')) df_all = df_all.drop(['date_account_created'], axis=1) 
#timestamp_first_active df_all.timestamp_first_active = pd.to_datetime(df_all.timestamp_first_active, format='%Y%m%d%H%M%S') df_all['tfa_year'] = df_all.timestamp_first_active.apply(lambda x: x.year) df_all['tfa_month'] = df_all.timestamp_first_active.apply(lambda x: x.month) df_all['tfa_day'] = df_all.timestamp_first_active.apply(lambda x: x.day) df_all['tfa_weekday'] = df_all.timestamp_first_active.apply(lambda x: x.weekday()) df_all['tfa_week'] = df_all.timestamp_first_active.apply(lambda x: x.week) df_all['tfa_log_elapsed'] = np.log((datetime.date(2016, 1, 1) - df_all.timestamp_first_active).astype('timedelta64[D]')) df_all = df_all.drop(['timestamp_first_active'], axis=1) #Age av = df_all.age.values df_all['age'] = np.where(np.logical_or(av<14, av>90), -1, av) df_all['age_year'] = np.where(av > 1900, -1, av) #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: df_all_dummy = pd.get_dummies(df_all[f], prefix=f) df_all = df_all.drop([f], axis=1) df_all = pd.concat((df_all, df_all_dummy), axis=1) ",Yes,4,8.0 "#Splitting train and test vals = df_all.values X_train = vals[:piv_train] le = LabelEncoder() y_train = le.fit_transform(labels) X_test = vals[piv_train:]",No,3,13.0 "np.random.seed(42) samples = np.random.choice(piv_train, 50000) X_train = vals[samples] y_train = le.fit_transform(labels)[samples]",No,5,21.0 "#Python Modules import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder import xgboost as xgb import operator",No,5,22.0 "# Loading data df_train = pd.read_csv('../input/train_users_2.csv') df_test = pd.read_csv('../input/test_users.csv') labels = df_train['country_destination'].values df_train = df_train.drop(['country_destination'], axis=1) id_test = df_test['id'] piv_train = df_train.shape[0]",No,3,45.0 "# Creating a DataFrame with train+test data df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) #Removing id and date_first_booking df_all = df_all.drop(['id', 'date_first_booking'], axis=1) #Filling nan df_all = df_all.fillna(-1)",Yes,4,17.0 "# date_account_created dac = np.vstack( df_all.date_account_created.astype(str).apply( lambda x: list(map(int, x.split('-'))) ).values ) df_all['dac_year'] = dac[:,0] df_all['dac_month'] = dac[:,1] df_all['dac_day'] = dac[:,2] df_all = df_all.drop(['date_account_created'], axis=1)",No,4,11.0 "# timestamp_first_active tfa = np.vstack( df_all.timestamp_first_active.astype(str).apply( lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]])) ).values ) df_all['tfa_year'] = tfa[:,0] df_all['tfa_month'] = tfa[:,1] df_all['tfa_day'] = tfa[:,2] df_all = df_all.drop(['timestamp_first_active'], axis=1)",No,3,8.0 "# One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: df_all_dummy = pd.get_dummies(df_all[f], prefix=f) df_all = df_all.drop([f], axis=1) df_all = pd.concat((df_all, df_all_dummy), axis=1)",No,5,20.0 "# Splitting train and test X = df_all.iloc[:piv_train,:] le = LabelEncoder() y = le.fit_transform(labels) X_test = df_all.iloc[piv_train:,:]",No,4,13.0 "# Classifier params = {'eta': 0.1, 'max_depth': 8, 'nround': 100, 'subsample': 0.7, 'colsample_bytree': 0.8, 'seed': 1, 'objective': 'multi:softprob', 
'eval_metric':'ndcg', 'num_class': 12, 'nthread':3} num_boost_round = 10 dtrain = xgb.DMatrix(X, y) clf1 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)",No,5,7.0 "# Get feature scores and store in DataFrame importance = clf1.get_fscore() importance_df = pd.DataFrame( sorted(importance.items(), key=operator.itemgetter(1)), columns=['feature','fscore'] )",No,5,86.0 "# Plot feature importance of top 20 importance_df.iloc[-20:,:].plot(x='feature',y='fscore',kind='barh') # Only select features w/ a feature score (can also specify min fscore) # Retrain model with reduced feature set df_all = df_all[importance_df.feature.values] X = df_all.iloc[:piv_train,:] X_test = df_all.iloc[piv_train:,:] dtrain = xgb.DMatrix(X, y) clf2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round) y_pred = clf2.predict(xgb.DMatrix(X_test)).reshape(df_test.shape[0],12)",Yes,4,48.0 "# Taking the 5 classes with highest probabilities ids = [] #list of ids cts = [] #list of countries for i in range(len(id_test)): idx = id_test[i] ids += [idx] * 5 cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist() # Generate submission sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country']) sub.to_csv('sub.csv',index=False)",Yes,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns sns.set() import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,88.0 "data_train = pd.read_csv('../input/train_users_2.csv',parse_dates=['timestamp_first_active','date_account_created','date_first_booking']) data_test = pd.read_csv('../input/test_users.csv',parse_dates=['timestamp_first_active','date_account_created','date_first_booking']) data_train.head()",No,4,45.0 "data_test.head() # Note that 'date_first_booking' is completely missing in test data",No,5,41.0 "# Note incorrect minimum and maximum age values data_train.describe()",No,5,40.0 "print ('Number of lines in the training data are,',data_train.shape[0])",No,5,58.0 "countries = pd.read_csv('../input/countries.csv') countries.head(10)",No,4,45.0 data_all.head(),No,5,41.0 print(data_all.isnull().sum()),No,5,39.0 "# Splitting date time data for date account created data_all['dac_year'] = data_all.date_account_created.dt.year data_all['dac_month'] = data_all.date_account_created.dt.month data_all['dac_day'] = data_all.date_account_created.dt.day # Splitting date time data for time first active data_all['tfa_year'] = data_all.timestamp_first_active.dt.year data_all['tfa_month'] = data_all.timestamp_first_active.dt.month data_all['tfa_day'] = data_all.timestamp_first_active.dt.day data_all.drop('date_account_created',1, inplace=True) data_all.drop('timestamp_first_active',1, inplace=True)",No,4,8.0 data_all.describe(),No,5,40.0 data_all.gender.value_counts(dropna=False).plot(kind='bar'),No,5,33.0 "data_all.dac_year.value_counts(sort=False).plot(kind='bar', title='Number of User Accounts Created in a Year')",No,5,33.0 
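The cells above rank each user's five most probable destination countries because the competition scores submissions with NDCG@5 (the same 'ndcg' named in eval_metric). A minimal sketch of that metric for a single user, assuming one true destination with relevance 1 at its rank and 0 elsewhere (ndcg_at_5 is an illustrative helper, not part of the original notebooks):

import numpy as np

def ndcg_at_5(ranked_countries, true_country):
    # DCG over the top 5 positions: rel_i / log2(i + 1); the ideal DCG is 1, so it is also the NDCG
    for i, country in enumerate(ranked_countries[:5], start=1):
        if country == true_country:
            return 1.0 / np.log2(i + 1)
    return 0.0  # true destination missing from the top 5

# e.g. the true destination 'FR' ranked third scores 1 / log2(4) = 0.5
print(ndcg_at_5(['NDF', 'US', 'FR', 'IT', 'GB'], 'FR'))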
"data_all.tfa_year.value_counts(sort=False).plot(kind='bar', title = 'Number of Users by First Active Year')",No,5,33.0 "data_train.country_destination.value_counts(normalize=True).plot(kind='bar',title='Countries Visited by AirBNB Users')",No,5,33.0 data_all.language.value_counts(sort=True),No,5,72.0 "#Note no null data now left print(data_all.isnull().sum())",No,5,39.0 "b""# Import sklearn.preprocessing.StandardScaler\n#from sklearn.preprocessing import MinMaxScaler\n\n# Initialize a MinMax scaler, then apply it to the numerical features\n#scaler = MinMaxScaler()\n#numerical = ['age','dac_year','dac_month','dac_day','tfa_year','tfa_month','tfa_day']\n#data_all[numerical] = scaler.fit_transform(data_all[numerical])\n\n# Create categorical columns\nfeatures = ['gender','signup_method','signup_flow','language','affiliate_channel','affiliate_provider',\\\n 'first_affiliate_tracked','signup_app','first_device_type','first_browser']\n\n# get dummies\ndata_all = pd.get_dummies(data_all,columns=features)\n""",No,5,20.0 "#Taking the 5 classes with highest probabilities ids = [] #list of ids cts = [] #list of countries for i in range(len(test_ids)): idx = test_ids[i] ids += [idx] * 5 cts += labler.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist() #Generate submission sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country']) sub.to_csv('submission.csv',index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "# Load the data into DataFrames train_users = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv') test_users = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv')",No,5,45.0 "print(""Number of users in training set ="", train_users.shape[0] ) print(""Number of users in test set ="",test_users.shape[0])",No,5,58.0 train_users.head(),No,5,41.0 train_users.describe(include = 'all'),No,5,40.0 test_users.head(),No,5,41.0 test_users.describe(include = 'all'),No,5,40.0 "labels = train_users['country_destination'].values train_users = train_users.drop(['country_destination', 'date_first_booking'], axis=1) test_users = test_users.drop(['date_first_booking'], axis=1) id_test = test_users['id'] # Merge train and test users all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) # Remove ID's since now we are not interested in making predictions all_users.drop('id',axis=1, inplace=True) all_users.head()",No,4,10.0 "from datetime import datetime all_users['date_account_created'] = pd.to_datetime(all_users['date_account_created']) all_users['timestamp_first_active'] = pd.to_datetime((all_users.timestamp_first_active // 1000000), format='%Y%m%d') all_users['date_account_created'] = [datetime.timestamp(d) for d in all_users['date_account_created']] all_users['timestamp_first_active'] = [datetime.timestamp(d) for d in all_users['timestamp_first_active']]",No,5,16.0 all_users.age.describe(),No,5,40.0 "sns.distplot(all_users.age.dropna()) plt.xlabel('Age')",No,5,33.0 "sns.distplot(all_users.age.loc[all_users['age'] < 70].dropna()) plt.xlabel('Age')",No,5,33.0 "all_users['age'] = np.where(all_users['age']<=14, 14, all_users['age']) all_users['age'] = np.where(all_users['age']>=70, 70, all_users['age']) all_users['age'] = 
all_users['age'].fillna(all_users['age'].dropna().values.mean()) all_users['age'].describe()",No,4,8.0 all_users['age'].values.mean(),No,5,40.0 "categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method' ] # one-hot-encoding for categorical_feature in categorical_features: all_users_dummies = pd.get_dummies(all_users[categorical_feature], prefix=categorical_feature) all_users = all_users.drop([categorical_feature], axis=1) all_users = pd.concat((all_users, all_users_dummies), axis=1)",No,4,20.0 all_users.head(),No,5,41.0 "from sklearn.preprocessing import LabelEncoder train_users_n = train_users.shape[0] X_train = all_users.values[:train_users_n] le = LabelEncoder() y_train = le.fit_transform(labels) X_test = all_users.values[train_users_n:]",No,5,20.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory %matplotlib inline sns.set_style(""white"") sns.set_context('talk') import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,88.0 "# Read The Files train = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv') age = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv') countries = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/countries.csv') sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv') test = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv')",No,5,45.0 "print(train.describe()) print(train.info())",No,5,40.0 "print(test.describe()) print(test.info())",No,5,40.0 "#Join the test & train data to fix them both at the same time df = train.append(test, ignore_index = True, sort = True) print(df.info()) ",No,4,11.0 "#Plotting distribution of the data for x in cat: sns.countplot(x=x, data=df,palette='RdBu') plt.ylabel('Number of users') plt.title('Users '+ x + ' Distribution') plt.xticks(rotation='vertical') plt.show() plt.savefig('plot'+str(x)+'.png') ",No,5,33.0 "# Investigate The Time Users Spend Between Being First Active and Actually Making A reservation df['timestamp_first_active'] = pd.to_datetime((df.timestamp_first_active // 1000000), format='%Y%m%d') df['date_first_booking'] = pd.to_datetime(df['date_first_booking']) df['time_to_booking']= df['date_first_booking'] - df['timestamp_first_active'] print(df.time_to_booking.describe())",No,4,16.0 "# Investigate Month and Year Of Users Bookings And Signing up to see most active years/months df['month_booking']= df.date_first_booking.dt.month df['year_booking']= df.date_first_booking.dt.year df['date_account_created'] = pd.to_datetime(df['date_account_created']) df['month_create']=df.date_account_created.dt.month df['year_create']=df.date_account_created.dt.year",No,4,16.0 "for x in 
['month_booking','year_booking','month_create','year_create'] : sns.countplot(x=x,data=df) plt.xticks(rotation='vertical') plt.show() plt.savefig('plot'+str(x)+'.png') ",No,5,33.0 "df.date_account_created.value_counts().plot(kind='line') plt.xlabel('Date') plt.title('New Accounts Created Over Time') plt.xticks(rotation='vertical') plt.show() plt.savefig('plot New Accounts Created Over Time.png') ",No,5,75.0 "new2 = sessions.groupby('user_id').count() print(new2.describe())",No,4,60.0 "#Drop Year Column because it's the same for all entries (2015) age = age.drop('year',axis = 1)",No,5,10.0 "# Group and Plot Age Data g = age.groupby(['age_bucket','gender']).sum().reset_index().sort_values('population_in_thousands') sns.set_context('talk') sns.barplot(x='age_bucket',y = 'population_in_thousands',data=g) plt.xticks(rotation='vertical') plt.title('Different Age Groups') plt.show() plt.savefig('plot Different Age Groups.png') ",No,5,33.0 "# The Age Data #set any value bigger than 130 or lower than 18 to be nan df.age[df.age > 110] = np.nan df.age[df.age < 18] = np.nan #Replace Missing age data with the mean df.loc[df['age'].isnull(),'age'] = df.age.median()",No,5,17.0 "#look at age distribution sns.distplot(df.age) plt.title('Age Distribution Of Users') plt.show()",No,5,33.0 "#Extract the remaining date information df['month_active']= df.timestamp_first_active.dt.month df['year_active']= df.timestamp_first_active.dt.year",No,5,8.0 "#Drop unnecessary columns after the extraction of useful data df1 = df.drop(['date_first_booking','time_to_booking','month_booking','year_booking','date_account_created', 'timestamp_first_active','timestamp_first_active','country_destination','id'],axis=1)",No,5,10.0 "# Handle categorical Columns ndf = pd.get_dummies(df1,columns=['affiliate_channel','affiliate_provider','first_affiliate_tracked', 'first_browser','first_device_type','language','signup_app','year_active' ,'signup_flow','signup_method','month_create','year_create','month_active'], drop_first =True,dtype='float16')",No,5,20.0 ndf.head(),No,5,41.0 "from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=5) neigh.fit(xtrn1,ytrn1)",No,5,7.0 yprd = neigh.predict(xtst1),No,5,48.0 "yprd1 = pd.DataFrame(yprd) yprd1.index = ytst1.index xtfinal = pd.concat([yprd1,xtst1],axis=1)",Yes,5,11.0 "xtfinal.rename(columns={0:'gender'},inplace = True )",No,5,61.0 "xtrain_final = pd.concat([ytrn1,xtrn1],axis=1)",No,5,11.0 "xfinal = xtrain_final.append(xtfinal) xfinal = pd.concat([xfinal,df.country_destination],axis=1)",No,5,11.0 xfinal.head(),No,5,41.0 "xy = pd.get_dummies(xfinal,columns=['gender'],drop_first =True,dtype='float16') ",No,5,20.0 "from sklearn.preprocessing import StandardScaler from sklearn.utils.class_weight import compute_class_weight from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV from sklearn.naive_bayes import BernoulliNB",No,5,22.0 "sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test)",No,5,18.0 "from xgboost.sklearn import XGBClassifier clf = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(x_train,y_train)",No,5,7.0 y_pred = clf.predict(x_test),No,5,48.0 "submission = pd.DataFrame({'id':test['id'],'country':y_pred}) submission.head() filename = 'Airbnb Predictions 1_1.csv' submission.to_csv(filename,index=False)",No,4,25.0 "df = train.append(test, ignore_index = True, sort = True) 
",No,5,11.0 "df.age[df.age > 110] = np.nan df.age[df.age < 18] = np.nan #Replace Missing age data with the mean df.loc[df['age'].isnull(),'age'] = -1",No,5,17.0 "# Extracting Age Data As before df['timestamp_first_active'] = pd.to_datetime((df.timestamp_first_active // 1000000), format='%Y%m%d') df['day_active'] = df.timestamp_first_active.dt.day df['month_active']= df.timestamp_first_active.dt.month df['year_active']= df.timestamp_first_active.dt.year df['date_account_created'] = pd.to_datetime(df['date_account_created']) df['day_create'] = df.date_account_created.dt.day df['month_create']=df.date_account_created.dt.month df['year_create']=df.date_account_created.dt.year",No,5,8.0 "ndf2 = df.drop(['date_first_booking','date_account_created', 'timestamp_first_active','timestamp_first_active','id'],axis = 1 )",No,5,10.0 "xy2 = pd.get_dummies(ndf2,columns=['affiliate_channel','affiliate_provider','first_affiliate_tracked', 'first_browser','first_device_type','gender','language','signup_app','signup_flow' ,'signup_method'])",No,5,20.0 "from xgboost.sklearn import XGBClassifier clf = XGBClassifier(max_depth=6, learning_rate=0.2, n_estimators=50,class_weight=class_weight , objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(x_train,y_train)",No,5,7.0 "submission = pd.DataFrame({'id':test['id'],'country':y_pred}) submission.head() filename = 'Airbnb Predictions 1_2.csv' submission.to_csv(filename,index=False)",No,4,25.0 "train = pd.merge(train, countries, how=""left"", left_on=[""country_destination""], right_on=[""country_destination""])",No,5,32.0 "x_train = train.drop(""country_destination"", axis=1) y_train = train[""country_destination""] x_test = test.copy()",Yes,5,21.0 "import numpy as np from sklearn import preprocessing for f in x_train.columns: if x_train[f].dtype=='object': lbl = preprocessing.LabelEncoder() if f not in x_test.columns: lbl.fit(np.unique(list(x_train[f].values))) x_train[f] = lbl.transform(list(x_train[f].values)) else: lbl.fit(np.unique(list(x_train[f].values) + list(x_test[f].values))) x_train[f] = lbl.transform(list(x_train[f].values)) x_test[f] = lbl.transform(list(x_test[f].values))",Yes,5,20.0 "for col in countries.columns: if col == 'country_destination': continue del(x_train[col])",No,4,10.0 "from sklearn.preprocessing import Imputer imp = Imputer(missing_values='NaN', strategy='mean', axis=0) x_train_nonan = imp.fit_transform(x_train) x_test_nonan = imp.fit_transform(x_test)",Yes,5,17.0 "from sklearn.tree import DecisionTreeClassifier model = DecisionTreeClassifier() model.fit(x_train_nonan, y_train)",Yes,5,7.0 y_test = model.predict(x_test_nonan),No,5,48.0 "submission = pd.DataFrame() submission[""id""] = test[""id""] submission[""country""] = y_test submission.to_csv('airbnb.csv', index=False)'",Yes,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import accuracy_score from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder",No,5,22.0 "train = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') age_gender = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip') sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')",No,5,45.0 "X_train = train.drop(['date_first_booking', 
'country_destination'], axis=1) X_test = test.drop(['date_first_booking'], axis=1)",No,5,10.0 "y_des = train['country_destination'].values X=pd.concat((X_train, X_test), axis=0, ignore_index=True) X.shape",Yes,4,11.0 X.fillna(method='pad').head(),No,4,17.0 "X.loc[X.age > 90, 'age'] = -1 X.loc[X.age < 13, 'age'] = -1 X['age'].describe()",Yes,4,8.0 "X.loc[X.age.isnull(), 'age']=X.age.mean()",No,5,17.0 "dac = np.vstack( X.date_account_created.astype(str).apply( lambda x: list(map(int, x.split('-'))) ).values ) X['dac_year'] = dac[:, 0] X['dac_month'] = dac[:, 1] X['dac_day'] = dac[:, 2] X = X.drop(['date_account_created'], axis=1) X.head()",Yes,4,8.0 "df = sessions.user_id.value_counts() print(df.shape) print(df.head())",Yes,3,72.0 "df = df.to_frame() df = df.rename(columns = {'user_id' : 'session_count'}) df['id'] = df.index df.head()",Yes,4,61.0 "X = pd.merge(X, df, how = 'left', on = ['id']) X.session_count.fillna(-1, inplace = True) X.session_count = X.session_count.astype(int)",Yes,3,32.0 "tfa = np.vstack( X.timestamp_first_active.astype(str).apply( lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]])) ).values ) X['tfa_year'] = tfa[:, 0] X['tfa_month'] = tfa[:, 1] X['tfa_day'] = tfa[:, 2] X = X.drop(['timestamp_first_active'], axis=1)",Yes,4,8.0 "# age distributions train['corrected_age']=train['age'].apply(lambda x : 36 if x>90 or x<10 else x) sns.distplot(train.corrected_age.dropna())",Yes,5,33.0 "# percentage of users using different signup_method signup_method = X.signup_method.value_counts(dropna = False) / len(X) * 100 signup_method.plot('bar', rot = 0) plt.xlabel('Sign up method') plt.ylabel('Percentage of signup_method')",Yes,5,33.0 "# percentage of gender gender = X.gender.value_counts(dropna = False) / len(X) * 100 gender.plot('bar', rot = 0) plt.xlabel('gender') plt.ylabel('Percentage of gender')",Yes,5,33.0 "# percentage of people going to different countries des_countries = train.country_destination.value_counts(dropna = False) / len(train) * 100 des_countries.plot('bar', rot = 0) plt.xlabel('Destination country') plt.ylabel('Percentage of booking')",Yes,5,33.0 "# Relavance between Age and destination sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(10, 7) sns.boxplot(y='age' , x='country_destination',data=train) plt.xlabel('Destination Country box plot',size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12)",No,4,81.0 "# relevance between age and signup method sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='signup_method',data=train) plt.xlabel('Signup method', size=15) plt.ylabel('age', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "# relevence between age and signup app sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='signup_app',data=train) plt.xlabel('Signup app',size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "#relevence between age and language sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(8, 5) sns.boxplot(y='age' , x='language',data=train) plt.xlabel('Language', size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "# relevance between age and gender sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='gender',data=train) plt.xlabel('Gender', size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=10) 
#sns.despine()",No,4,81.0 "# chart for number of account created train['date_account_created_new'] = pd.to_datetime(train['date_account_created']) sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(10, 8) train.date_account_created_new.value_counts().plot(kind='line', linewidth=1, color='#1F618D') plt.xlabel('Date ', size=20) plt.ylabel('Number of account created ', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,5,75.0 "oh_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']",No,5,77.0 "for feature in oh_features: X_dummy = pd.get_dummies(X[feature], prefix=feature) X = X.drop([feature], axis=1) X = pd.concat((X, X_dummy), axis=1) X.head()",Yes,3,20.0 "#split the well processed dataset into X_train and X_test X_train = X.iloc[:len(train), :] X_test = X.iloc[len(train):, :] X_train = X_train.drop(['id'], axis=1) X_train.shape X_test = X_test.drop(['id'], axis=1)",Yes,4,13.0 "le = LabelEncoder() y_trans = le.fit_transform(y_des) y_trans.shape",Yes,4,20.0 "dtrain, dtest, train_label, test_label = train_test_split(X_train, y_trans, test_size = 0.3, random_state = 817)",No,5,13.0 "#logistic regression from sklearn.linear_model import LogisticRegression logreg = LogisticRegression() logreg.fit(dtrain, train_label) pred_log=logreg.predict(dtest) from sklearn.metrics import accuracy_score print(accuracy_score(test_label, pred_log))",Yes,3,7.0 "from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(max_depth=20, n_estimators=100) rfc.fit(dtrain , train_label) pred = rfc.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "fi=pd.Series(rfc.feature_importances_, index=dtrain.columns) fn=fi.sort_values(ascending=True) fn[-20:].plot(kind='barh', color='r', figsize=(25, 12)) plt.xlabel('importance', size=15) plt.title('Random Forest Importance', size=20) plt.tick_params(labelsize=15)",No,5,79.0 "from sklearn.tree import DecisionTreeClassifier dtc = DecisionTreeClassifier(max_depth=10) dtc.fit(dtrain , train_label) pred = dtc.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "from xgboost.sklearn import XGBClassifier xgb = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100, objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=40) xgb.fit(dtrain , train_label) pred = xgb.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "# only XGBoost xgb = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100, objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=40) xgb.fit(X_train, y_trans) XGBC_pred_test = xgb.predict(X_test) XGBC_pred_test_prob=xgb.predict_proba(X_test)",Yes,3,7.0 "ids_test = test['id'] ids = [] countries = [] for i in range(len(X_test)): idx = ids_test[i] ids += [idx] * 5 countries += le.inverse_transform(np.argsort(XGBC_pred_test_prob[i])[::-1][:5]).tolist()",No,5,53.0 "submission = pd.DataFrame({ ""id"" : ids, ""country"" : countries }) submission.to_csv('submission_XGBC.csv', index = False)'",No,5,25.0 "n_labels=len(set(y_des)) n_labels",No,5,77.0 "params = { 'objective': 'multi:softprob', 'eval_metric': 'merror', 'num_class': n_labels, 'eta': 0.5, 'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 0.3, 'silent': 1, 'seed': 123 }",No,5,59.0 "import xgboost as xgb num_boost_round = 50 Dtrain = xgb.DMatrix(X_train, y_trans) res = xgb.cv(params, Dtrain, num_boost_round=num_boost_round, nfold=5, 
callbacks=[xgb.callback.print_evaluation(show_stdv=True), xgb.callback.early_stop(50)])",No,5,28.0 "num_boost_round = res['test-merror-mean'].idxmin() print(format(num_boost_round)) clf = xgb.train(params, Dtrain, num_boost_round=num_boost_round) clf",Yes,5,7.0 "import operator importance = clf.get_fscore() importance_df = pd.DataFrame( sorted(importance.items(), key=operator.itemgetter(1)), columns=['feature', 'fscore'] )",Yes,5,79.0 "importance_df = importance_df.iloc[-20:, :]",No,5,14.0 "plt.figure() importance_df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 10)) plt.title('XGBoost Feature Importance', size=25) plt.xlabel('Relative importance', size=20) plt.ylabel('Features', size=20) plt.tick_params(labelsize=15) #plt.gcf().savefig('feature_importance.png')",No,5,79.0 "#importing neccessary libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import datetime from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier import warnings warnings.filterwarnings('ignore') ",No,5,23.0 "sessions=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip') countries=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/countries.csv.zip') age_gender=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip') submission=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sample_submission_NDF.csv.zip')",No,5,45.0 "test_ids = data_test['id'] Nrows_train = data_train.shape[0] # Store country names labels = data_train['country_destination'].values data_train1 = data_train.drop(['country_destination'], axis=1) # Combining the test and train data. If this is not done, the number of dummy variable columns do not match in test and train data. # Some items present in train data and are not present in test data. For example, browser type. data_all = pd.concat((data_train1, data_test), axis = 0, ignore_index = True) # Dropping ids which are saved separately and date of first booking which is completely absent in the test data data_all = data_all.drop(['id','date_first_booking'], axis=1) ",Yes,3,10.0 "data_all.loc[data_all.age > 100, 'age'] = np.nan data_all.loc[data_all.age < 18, 'age'] = np.nan",No,5,8.0 "data_all.groupby('gender').age.agg(['min','max','mean','count'])",No,5,60.0 data_all.groupby('gender').age.mean().plot(kind='bar'),No,5,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns from xgboost.sklearn import XGBClassifier from datetime import datetime from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session",No,5,88.0 "train=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') #sessions=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip') #countries=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/countries.csv.zip') #age_gender=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip')",No,5,45.0 "b""df.info() # I check info() to spot the columns with missing values and to understand the data types. """,No,5,40.0 train.info() ,No,5,40.0 df['aged'].describe() ,No,5,40.0 "plt.figure(figsize=(30,15)) sns.countplot(x='aged', data=df)",No,5,33.0 "b""df.loc[df['aged']>=85, 'aged']=np.nan # Looking at the plot, I decided not to keep ages of 85 and above, so I set those to NaN as well.""",No,5,8.0 df['aged'].describe(),No,5,40.0 "b""df.isnull().sum() # Apart from the test data, 'first_affiliate_tracked' still shows empty cells.""",No,5,39.0 "b""df['first_active_date']=first_active_date # I split timestamp_first_active into two separate columns, first active date and first active time\ndf['first_active_time']=first_active_time\ndf=df.drop(['timestamp_first_active'], axis=1)""",No,4,8.0 "df.select_dtypes(""object"").columns",No,5,71.0 " le = LabelEncoder() df['signup_method']= le.fit_transform(df['signup_method']) df['language']= le.fit_transform(df['language']) df['affiliate_channel']= le.fit_transform(df['affiliate_channel']) df['affiliate_provider']= le.fit_transform(df['affiliate_provider']) df['signup_app']= le.fit_transform(df['signup_app']) df['first_device_type']= le.fit_transform(df['first_device_type']) df['gender']= le.fit_transform(df['gender']) df['first_browser']= le.fit_transform(df['first_browser']) df['first_affiliate_tracked']= le.fit_transform(df['first_affiliate_tracked'])",No,5,20.0 "df['country_destination'].replace('US',1, inplace=True) df['country_destination'].replace('other',2, inplace=True) df['country_destination'].replace('FR',3, inplace=True) df['country_destination'].replace('CA',4, inplace=True) df['country_destination'].replace('GB',5, inplace=True) df['country_destination'].replace('ES',6, inplace=True) df['country_destination'].replace('IT',7, inplace=True) df['country_destination'].replace('PT',8, inplace=True) df['country_destination'].replace('DE',9, inplace=True) df['country_destination'].replace('NL',10, inplace=True) df['country_destination'].replace('AU',11, inplace=True)",No,5,20.0 "b""dft=df[df['Train']==1] # take only the training data\nX=dft.drop(['country_destination','Train', 'dfb_year', 'dfb_month', 'dfb_day'],axis=1) # define the features
\nY = dft['country_destination'] # define the target""",No,5,21.0 "x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.3, random_state = 42) # split the data",No,5,13.0 "from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(criterion='entropy', max_depth= 8, max_leaf_nodes=30, min_samples_leaf=30, n_estimators= 100, random_state=0) rfc.fit(x_train, y_train) prediction = pd.DataFrame(data=rfc.predict(x_test), index = x_test.index) classification_metrics(y_test, prediction)",No,3,7.0 "pred_country={ 1:""US"", 2:""other"", 3:""FR"", 4:""CA"", 5:""GB"", 6:""ES"", 7:""IT"", 8:""PT"", 9:""DE"", 10:""NL"", 11:""AU""}",No,5,77.0 "b""dftest=df[df['Train']==0] # prepare for the submission""",No,5,14.0 tested = xgb.predict(testX) # prepare for the submission,No,5,48.0 "results=[] for i in tested: results.append(pred_country[i]) print(results)",No,5,53.0 "my_submission = pd.DataFrame({'id': mysubmission_ID, 'country':results}) my_submission.to_csv('submission.csv', index=False)",No,5,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB import xgboost as xgb",No,5,23.0 "# get homesite & test csv files as a DataFrame homesite_df = pd.read_csv(""../input/train.csv"") test_df = pd.read_csv(""../input/test.csv"") # preview the data homesite_df.head()",No,5,45.0 "homesite_df.info() print(""----------------------------"") test_df.info()",No,5,40.0 "# drop unnecessary columns, these columns won't be useful in analysis and prediction homesite_df = homesite_df.drop(['QuoteNumber'], axis=1)",No,5,10.0 "# date # Convert Date to Year, Month, and Week homesite_df['Year'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) homesite_df['Month'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) homesite_df['Week'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) test_df['Year'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) test_df['Week'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) homesite_df.drop(['Original_Quote_Date'], axis=1,inplace=True) test_df.drop(['Original_Quote_Date'], axis=1,inplace=True)",No,4,8.0 "# customers purchased insurance plan # Plot sns.countplot(x=""QuoteConversion_Flag"", data=homesite_df)",No,5,33.0 "# year # Which year has higher number of customers purchased insurance plan # Plot fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5)) sns.countplot(x=""QuoteConversion_Flag"",hue=""Year"", data=homesite_df, ax=axis1) sns.countplot(x=homesite_df[""Year""].loc[homesite_df[""QuoteConversion_Flag""] == 1], order=[2013,2014,2015], ax=axis2)",No,5,33.0 "# month # Which month has higher number of customers purchased insurance plan # Plot sns.countplot(x=homesite_df[""Month""].loc[homesite_df[""QuoteConversion_Flag""] == 1], order=[1,2,3,4,5,6,7,8,9,10,11,12])",No,5,33.0 "# fill NaN values homesite_df.fillna(-1, inplace=True) test_df.fillna(-1, inplace=True)",No,5,17.0 "# There are some columns with non-numerical values(i.e.
dtype='object'), # So, We will create a corresponding unique numerical value for each non-numerical value in a column of training and testing set. from sklearn import preprocessing for f in homesite_df.columns: if homesite_df[f].dtype=='object': lbl = preprocessing.LabelEncoder() lbl.fit(np.unique(list(homesite_df[f].values) + list(test_df[f].values))) homesite_df[f] = lbl.transform(list(homesite_df[f].values)) test_df[f] = lbl.transform(list(test_df[f].values))",No,5,20.0 "# define training and testing sets X_train = homesite_df.drop(""QuoteConversion_Flag"",axis=1) Y_train = homesite_df[""QuoteConversion_Flag""] X_test = test_df.drop(""QuoteNumber"",axis=1).copy()",No,5,21.0 "# Create submission submission = pd.DataFrame() submission[""QuoteNumber""] = test_df[""QuoteNumber""] submission[""QuoteConversion_Flag""] = Y_pred submission.to_csv('homesite.csv', index=False)'",No,4,25.0 "import pandas as pd import xgboost as xgb import numpy as np from sklearn import preprocessing df = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") df['Date']=pd.to_datetime(pd.Series(df['Original_Quote_Date'])) df['Year']=df['Date'].apply(lambda x: int(str(x)[:4])) df['Month']=df['Date'].apply(lambda x: int(str(x)[5:7])) df['Date']=df['Date'].apply(lambda x: int(str(x)[8:10])) df['Field10'].apply(lambda x : int(x.replace(',','')) ) test['Date']=pd.to_datetime(pd.Series(test['Original_Quote_Date'])) test['Year']=test['Date'].apply(lambda x: int(str(x)[:4])) test['Month']=test['Date'].apply(lambda x: int(str(x)[5:7])) test['Date']=test['Date'].apply(lambda x: int(str(x)[8:10])) test['Field10'].apply(lambda x : int(x.replace(',',''))) label=df['QuoteConversion_Flag'] df.drop('QuoteConversion_Flag',axis=1,inplace=True) number=test['QuoteNumber'] drop_columns=['Original_Quote_Date','QuoteNumber'] for names in drop_columns: df.drop(names,axis=1,inplace=True) test.drop(names,axis=1,inplace=True) clf=xgb.XGBClassifier(max_depth=7,learning_rate=0.03,n_estimators=650,subsample=0.86,seed=50)'",No,3,8.0 "for f in df.columns: if df[f].dtypes=='object': encoder=preprocessing.LabelEncoder() encoder.fit( list(df[f])+list(test[f]) ) df[f]=encoder.transform(list(df[f].values)) test[f]=encoder.transform(list(test[f].values)) df.fillna(-1,inplace=True) test.fillna(-1,inplace=True)",No,4,20.0 "clf.fit(df,label) output=clf.predict_proba(test)[:,1]",No,4,48.0 "sample=pd.read_csv('../input/sample_submission.csv') sample.QuoteConversion_Flag=output sample.to_csv('final.csv',index=False)",No,4,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import maxabs_scale import xgboost as xgb ",No,5,23.0 "# fill NaN values homesite_df.fillna(homesite_df.mean(), inplace=True) test_df.fillna(test_df.mean(), inplace=True) # define training and testing sets X_train = homesite_df.drop(""QuoteConversion_Flag"",axis=1) Y_train = homesite_df[""QuoteConversion_Flag""] X_test = test_df.drop(""QuoteNumber"",axis=1).copy() X_train = maxabs_scale(X_train) X_test = maxabs_scale(X_test)",No,4,21.0 "import pandas as pd from collections import Counter %pylab inline ",No,5,23.0 
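The Homesite cells above submit predict_proba(...)[:, 1] because that competition is scored by ROC AUC. A minimal sketch of checking that score on a held-out split, assuming X_train and Y_train have already been built as in the ""# define training and testing sets"" cell above (the smaller n_estimators is only to keep the sketch fast):

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# hold out 20% of the labelled quotes for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)

model = xgb.XGBClassifier(max_depth=7, learning_rate=0.03, n_estimators=100, subsample=0.86)
model.fit(X_tr, y_tr)

# AUC is computed on the positive-class probability, the same quantity written to the submission file
val_probs = model.predict_proba(X_val)[:, 1]
print('validation AUC:', roc_auc_score(y_val, val_probs))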
"trainPath = r'../input/train.csv' testPath = r'../input/test.csv'",No,5,77.0 "train_df = pd.read_csv(trainPath) test_df = pd.read_csv(testPath)",No,5,45.0 "#deal character data train_df['Year'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) train_df['Month'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) train_df['Week'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) test_df['Year'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) test_df['Week'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) train_df.drop(['Original_Quote_Date'], axis=1,inplace=True) test_df.drop(['Original_Quote_Date'], axis=1,inplace=True)",No,4,8.0 "#get character data columns notNumTypeCol = [col for col in train_df.columns if train_df[col].dtype == dtype('O')]",No,5,77.0 "train_x_df = train_df.drop(""QuoteConversion_Flag"",axis=1) train_y_df = train_df[""QuoteConversion_Flag""]",No,5,21.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "train = train.drop('QuoteNumber', axis=1) test = test.drop('QuoteNumber', axis=1) # Lets play with some dates train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date'])) train = train.drop('Original_Quote_Date', axis=1) test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date'])) test = test.drop('Original_Quote_Date', axis=1) train['Year'] = train['Date'].apply(lambda x: int(str(x)[:4])) train['Month'] = train['Date'].apply(lambda x: int(str(x)[5:7])) train['weekday'] = train['Date'].dt.dayofweek test['Year'] = test['Date'].apply(lambda x: int(str(x)[:4])) test['Month'] = test['Date'].apply(lambda x: int(str(x)[5:7])) test['weekday'] = test['Date'].dt.dayofweek train = train.drop('Date', axis=1) test = test.drop('Date', axis=1)",No,4,8.0 "from sklearn.model_selection import StratifiedKFold clf = GridSearchCV(xgb_model, parameters, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc', verbose=2, refit=True) clf.fit(train[features], train[""QuoteConversion_Flag""])'",No,5,6.0 "test_probs = clf.predict_proba(test[features])[:,1]",No,5,48.0 "import os import sys import random import warnings import numpy as np import pandas as pd import matplotlib.pyplot as plt from tqdm import tqdm from keras.models import Model, load_model from keras.layers import Input from keras.layers.core import Dropout, Lambda from keras.layers.convolutional import Conv2D, Conv2DTranspose from keras.layers.pooling import MaxPooling2D from keras.layers.merge import concatenate from keras.callbacks import EarlyStopping, ModelCheckpoint from keras import backend as K from keras.optimizers import Adam import tensorflow as tf from skimage.io import imread from skimage.transform import resize",No,5,22.0 "X_train = np.zeros((tot_num, IMG_HEIGHT, IMG_WIDTH), dtype=np.float32) Y_train = np.zeros((tot_num, IMG_HEIGHT, IMG_WIDTH), dtype=np.float32)",No,5,21.0 "X_train_one = np.array(X_train_one) Y_train_one = np.array(Y_train_one) X_train_zero = np.array(X_train_zero) Y_train_zero = np.array(Y_train_zero)",No,5,21.0 "X_train = [] Y_train = []",No,5,77.0 "X_train = np.array(X_train) Y_train = np.array(Y_train)",No,5,21.0 IMG_CHANNELS = 1,No,5,77.0 "!pip3 install git+https://github.com/qubvel/segmentation_models from segmentation_models import Unet # model = Unet('densenet121',encorder_weights='imagenet',freeze_encorder=True)",No,5,87.0 "results = 
model.fit(X_train_ax, Y_train_ax, validation_split=0.1, batch_size=8, epochs=18)",No,5,7.0 execute_data = False,No,5,77.0 execute_metric = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # u_model_blocks # ====================================================================================================================== ######################################################################################################################## # needed for u_model # standard-module imports from keras.layers import add, concatenate, Conv2D, MaxPooling2D from keras.layers import BatchNormalization, Lambda from keras.layers.advanced_activations import ELU, LeakyReLU # ====================================================================================================================== # utility blocks needed for internal performance # ====================================================================================================================== def NConv2D(filters, kernel_size, strides=(1, 1), padding='valid', dilation_rate=1, activation=None, kernel_initializer='glorot_uniform'): """"""Create a (Normalized Conv2D followed by a chosen activation) function Conv2D -> BatchNormalization -> activation() :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution) :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the height and width. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any dilation_rate value != 1. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. 
Currently, specifying any dilation_rate value != 1 is incompatible with specifying any strides value != 1 :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to be performed after BatchNormalization :param kernel_initializer: Initializer for the kernel weights matrix (see initializers in keras documentation) :return: a function, combined of 2D Convolution, followed by BatchNormalization across filters, and specified activation in that order """""" assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None def f(_input): conv = Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, dilation_rate=dilation_rate, kernel_initializer=kernel_initializer)(_input) norm = BatchNormalization(axis=3)(conv) return actv()(norm) return f # needed for rblock (residual block) def _shortcut(_input, residual): stride_width = _input._keras_shape[1] / residual._keras_shape[1] stride_height = _input._keras_shape[2] / residual._keras_shape[2] equal_channels = residual._keras_shape[3] == _input._keras_shape[3] shortcut = _input # 1 X 1 conv if shape is different. Else identity. if stride_width > 1 or stride_height > 1 or not equal_channels: shortcut = Conv2D(filters=residual._keras_shape[3], kernel_size=(1, 1), strides=(stride_width, stride_height), kernel_initializer=""he_normal"", padding=""valid"")(_input) return add([shortcut, residual]) def rblock(inputs, filters, kernel_size, padding='valid', activation=None, scale=0.1): """"""Create a scaled Residual block connecting the down-path and the up-path of the u-net architecture Activations are scaled by a constant to prevent the network from dying. Usually is set between 0.1 and 0.3. See: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output convolution filters) :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. 
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to use everywhere in the block :param scale: scaling factor preventing the network from dying out :return: 4D tensor (samples, rows, cols, channels) output of a residual block, given inputs """""" assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None residual = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding)(inputs) residual = BatchNormalization(axis=3)(residual) residual = Lambda(lambda x: x * scale)(residual) res = _shortcut(inputs, residual) return actv()(res) # ====================================================================================================================== # information blocks # ====================================================================================================================== def convolution_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, version='normalized', pars={}, allowed_pars={}): """"""Create a version of a convolution block. Versions: with and without batch-normalization after convolutions. :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block :param version: version of the convolution block, one of 'not_normalized', 'normalized' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a convolution block, given inputs """""" assert activation in ['relu', 'elu', None] # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('convolution').get('simple') == ['not_normalized', 'normalized'] # keep version argument if need to use without PARS assert version in ['not_normalized', 'normalized'] # setting the version from pars if pars.get('information_block').get('convolution').get('simple') is not None: version = pars.get('information_block').get('convolution').get('simple') if version == 'normalized': conv1 = NConv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(inputs) return NConv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv1) else: conv1 = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(inputs) return Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv1) def dilated_convolution_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, version='normalized', pars={}, allowed_pars={}): """"""Create a version of a dilated-convolution block. 
Versions: with and without batch-normalization after dilated convolutions. See more about dilated convolutions: https://towardsdatascience.com/review-dilated-convolution-semantic-segmentation-9d5a5bd768f5 :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block :param version: version of the dilated-convolution block, one of 'not_normalized', 'normalized' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a dilated-convolution block, given inputs """""" assert activation in ['relu', 'elu', None] # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('convolution').get('dilated') == ['not_normalized', 'normalized'] # keep version argument if need to use without PARS assert version in ['not_normalized', 'normalized'] # setting the version from pars if pars.get('information_block').get('convolution').get('dilated') is not None: version = pars.get('information_block').get('convolution').get('dilated') if version == 'normalized': conv1 = NConv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=2, activation=activation)(inputs) return NConv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=1, activation=activation)(conv1) else: conv1 = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=2, activation=activation)(inputs) return Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=1, activation=activation)(conv1) def inception_block_v1(inputs, filters, activation=None, version='b', pars={}, allowed_pars={}): """"""Create a version of v1 inception block described in: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Create an inception block described in v1, sections 'a' (for naive version), or 'b' (with dimension reduction) Each version has 4 verticals in their structure. See the link above. For all versions, verticals 1 and 2 of the block start with 2D convolution, which: reduces the number of input filters to next convolutions (to make computation cheaper) uses (1, 1) kernels, no Normalization is NOT normalized is followed by specified activation For all versions, verticals 1, 2, 3: the final convolution layer is not normalised and not activated since it will be done after concatenation Vertical 4 is just a Conv2D. It gets normalized and activated after being concatenated with outputs of other verticals. The concatenated output of the verticals is normalised and then activated with a given activation :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution).
:param activation: string, specifies activation function to use everywhere in the block :param version: version of inception block, one of 'a', 'b' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('v1') == ['a', 'b'] # keep version argument if need to use without PARS assert version in ['a', 'b'] # setting the version from pars if pars.get('information_block').get('inception').get('v1') is not None: version = pars.get('information_block').get('inception').get('v1') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 if version == 'a': c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), padding='same', kernel_initializer='he_normal')(inputs) else: c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), padding='same', kernel_initializer='he_normal')(c1_1) # vertical 2 if version == 'a': c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(inputs) else: c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(c2_1) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) if version == 'b': c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) else: c3 = p3_1 # vertical 4 c4_1 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) c4 = c4_1 # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result def inception_block_v2(inputs, filters, activation=None, version='b', pars={}, allowed_pars={}): """"""Create a version of v2 inception block described in: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Create an inception block described in v2, sections 'a', 'b', or 'c' Each version has 4 verticals in their structure. See the link above. For all versions, verticals 1 and 2 of the block start with 2D convolution, which: reduces the number of input filters to next convolutions (to make computation cheaper) uses (1, 1) kernels, no Normalization is NOT normalized is followed by specified activation For all versions, verticals 1, 2, 3: the middle convolutions use NConv2D with given activation, see its docstring the final convolution layer is not normalised and not activated since it will be done after concatenation Vertical 4 is just a Conv2D. It gets normalized and activated after being concatenated with outputs of other verticals.
The concatenated output of the verticals is normalised and then activated with a given activation :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param activation: string, specifies activation function to use everywhere in the block :param version: version of inception block, one of 'a', 'b', 'c' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('v2') == ['a', 'b', 'c'] # keep version argument if need to use without PARS assert version in ['a', 'b', 'c'] # setting the version from pars if pars.get('information_block').get('inception').get('v2') is not None: version = pars.get('information_block').get('inception').get('v2') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'a': c1_2 = NConv2D(filters=filters // 8, kernel_size=3, padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1 = Conv2D(filters=filters // 8, kernel_size=3, padding='same', kernel_initializer='he_normal')(c1_2) elif version == 'b': c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1_3 = NConv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', activation=activation, kernel_initializer='he_normal')(c1_2) c1_4 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_3) c1 = Conv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c1_4) else: c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1_3 = NConv2D(filters=filters // 8, kernel_size=3, padding='same', activation=activation, kernel_initializer='he_normal')(c1_2) c1_41 = Conv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', kernel_initializer='he_normal')(c1_3) c1_42 = Conv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c1_3) c1 = concatenate([c1_41, c1_42], axis=3) # vertical 2 c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'a': c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(c2_1) elif version == 'b': c2_2 = NConv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c2_1) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c2_2) else: c2_21 = Conv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', kernel_initializer='he_normal')(c2_1) c2_22 = 
Conv2D(filters=filters // 2, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c2_1) c2 = concatenate([c2_21, c2_22], axis=3) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) # vertical 4 c4 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result def inception_block_et(inputs, filters, activation='relu', version='b', pars={}, allowed_pars={}): """"""Create an inception block with 2 options. For intuition read, parts v1 and v2: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Each version/option has 4 verticals in their structure. See the link above. Default option: version='b' Create an inception block close to one described in v2, but keeps 5 as a factor for some convolutions Alternative option: version='a' Create an inception block described in v1, section Function author Edward Tyantov. That's why the name: inception_block_et. My modifications use version='a' instead of split=False use version='b' instead of split=True change default to version='b', aka split=True swap: Conv2D -> BatchNormalization -> activation to: NConv2D blocks. See NConv2D documentation for them. swap: Conv2D -> activation to: Conv2D -> Conv2D(activation=activation) change the order of the verticals to coincide with v2_paper notation change names of the outputs of the block verticals to c1, c2, c3, c4 use 'result' instead of 'res' to avoid confusion with residuals :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). 
:param activation: activation function to use everywhere in the block :param version: version of inception block :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('et') == ['a', 'b'] # keep version argument if need to use without PARS assert version in ['a', 'b'] # setting the version from pars if pars.get('information_block').get('inception').get('et') is not None: version = pars.get('information_block').get('inception').get('et') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'b': c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 5), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1 = Conv2D(filters=filters // 8, kernel_size=(5, 1), kernel_initializer='he_normal', padding='same')(c1_2) else: c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), kernel_initializer='he_normal', padding='same')(c1_1) # vertical 2 c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'b': c2_2 = NConv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c2_1) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 1), kernel_initializer='he_normal', padding='same')(c2_2) else: c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), kernel_initializer='he_normal', padding='same')(c2_1) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) # vertical 4 c4 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result # ====================================================================================================================== # Combining blocks, allowing to use different blocks from before # ====================================================================================================================== def pooling_block(inputs, filters, kernel_size=(3, 3), strides=(2, 2), padding='same', activation=None, pool_size=(2, 2), trainable=True, pars={}, allowed_pars={}): """"""Function returning the output of one of the pooling blocks. 
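Both options halve the spatial resolution when strides == pool_size == (2, 2); the difference is that the strided NConv2D learns its downsampling weights while MaxPooling2D is a fixed operation. A rough shape sketch (the 32-channel input is only an example):
        NConv2D(filters=32, kernel_size=(3, 3), strides=(2, 2), padding='same')(x)   # (N, 80, 112, 32) -> (N, 40, 56, 32)
        MaxPooling2D(pool_size=(2, 2), padding='same')(x)                            # (N, 80, 112, 32) -> (N, 40, 56, 32)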
Avoids the need for separate versions of the u-net depending on how the pooling operation is performed: 1) trainable (default): through the NConv2D custom function, see its documentation 2) non-trainable (alternative): through a MaxPooling operation To get the expected behaviour when switching 'trainable', keep strides == pool_size pool_size is only used in the (trainable=False) MaxPooling2D branch; filters, kernel_size, strides and activation are only used in the (trainable=True) NConv2D branch :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: NConv2D argument, filters :param kernel_size: NConv2D argument, kernel_size :param strides: NConv2D argument, strides :param padding: NConv2D/MaxPooling2D argument, padding :param activation: NConv2D argument, activation :param pool_size: MaxPooling2D argument, pool_size :param trainable: boolean specifying the version of a pooling block with default behaviour trainable=True: NConv2D(inputs._keras_shape[3], kernel_size=kernel_size, strides=strides, padding=padding)( inputs) trainable=False: MaxPooling2D(pool_size=pool_size)(inputs) :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a pooling block """""" # checking that the allowed trainable parameters did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('pooling_block').get('trainable') == [True, False] # keep trainable argument if need to use without PARS assert trainable in [True, False] # setting the version from pars if pars.get('pooling_block').get('trainable') is not None: trainable = pars.get('pooling_block').get('trainable') # returning block's output if trainable: return NConv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, activation=activation)(inputs) else: return MaxPooling2D(pool_size=pool_size, padding=padding)(inputs) def information_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, block='inception', block_type='v2', version='b', pars={}, allowed_pars={}): """"""Function returning the output of one of the information blocks. :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions.
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block Next 3 parameters are there to be able to leave 'pars' and 'allowed_pars' empty :param block: one of 'inception' or 'convolution' (case-sensitive) :param block_type: if block == 'inception', one of 'v1', 'v2', 'et' (case-sensitive) if block == 'convolution': one of 'simple', 'dilated' (case-sensitive) :param version: version of a block to use :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a information block """""" # getting which block, block_type, version to use as the information block if pars.get('information_block') is not None: block = list(pars.get('information_block').keys())[0] block_type = list(pars.get('information_block').get(block).keys())[0] version = pars.get('information_block').get(block).get(block_type) # inception block if block == 'inception': if block_type == 'v1': return inception_block_v1(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) elif block_type == 'v2': return inception_block_v2(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) else: return inception_block_et(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) # convolution block else: if block_type == 'simple': return convolution_block(inputs=inputs, filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) else: return dilated_convolution_block(inputs=inputs, filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) def connection_block(inputs, filters, padding='valid', activation=None, version='residual', pars={}, allowed_pars={}): """"""Function returning the output of one of the connection block. :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). 
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to use everywhere in the block Version parameter is there to be able to leave 'pars' and 'allowed_pars' empty :param version: one of 'not_residual' or 'residual', version of a block to use :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a connection block """""" # checking that the allowed trainable parameters did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('connection_block') == ['not_residual', 'residual'] # keep trainable argument if need to use without PARS assert version in ['not_residual', 'residual'] # setting the version from pars if pars.get('connection_block') is not None: version = pars.get('connection_block') if version == 'residual': return rblock(inputs=inputs, filters=32, kernel_size=(1, 1), padding='same', activation=activation) else: return Conv2D(filters=filters, kernel_size=(2, 2), padding=padding, kernel_initializer='he_normal')(inputs) '",Yes,5,84.0 execute_u_model = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # u_model # ====================================================================================================================== ######################################################################################################################## # needed for train # standard-module imports import numpy as np from keras.layers import Input, concatenate, Conv2D, UpSampling2D, Dense from keras.layers import Dropout, Flatten from keras.models import Model from keras.optimizers import Adam from keras import backend as K # # separate-module imports # from metric import dice_coef, dice_coef_loss # from u_model_blocks import pooling_block, connection_block, information_block # from configuration import ALLOWED_PARS, PARS IMG_ROWS, IMG_COLS = 80, 112 K.set_image_data_format('channels_last') # (number of images, rows per image, cols per image, channels) # ====================================================================================================================== # U-net with Inception blocks, Normalised 2D Convolutions instead of Maxpooling # ====================================================================================================================== def get_unet_customised(optimizer, pars=PARS, allowed_pars=ALLOWED_PARS): """""" Creating and compiling the U-net This version is fully customisable by choosing pars argument :param optimizer: specifies the optimiser for the u-net, e.g. Adam, RMSProp, etc. 
:param pars: optional, dictionary of parameters passed to customise the U-net :param allowed_pars: optional, dictionary of parameters allowed to be passed to customise the U-net :return: compiled u-net, Keras.Model object """""" # string, activation function activation = pars.get('activation') # input inputs = Input((IMG_ROWS, IMG_COLS, 1), name='main_input') print('inputs:', inputs._keras_shape) # # down the U-net # conv1 = information_block(inputs, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv1', conv1._keras_shape) pool1 = pooling_block(inputs=conv1, filters=32, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool1', pool1._keras_shape) pool1 = Dropout(0.5)(pool1) print('pool1', pool1._keras_shape) conv2 = information_block(pool1, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv2', conv2._keras_shape) pool2 = pooling_block(inputs=conv2, filters=64, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool2', pool2._keras_shape) pool2 = Dropout(0.5)(pool2) print('pool2', pool2._keras_shape) conv3 = information_block(pool2, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv3', conv3._keras_shape) pool3 = pooling_block(inputs=conv3, filters=128, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool3', pool3._keras_shape) pool3 = Dropout(0.5)(pool3) print('pool3', pool3._keras_shape) conv4 = information_block(pool3, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv4', conv4._keras_shape) pool4 = pooling_block(inputs=conv4, filters=256, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool4', pool4._keras_shape) pool4 = Dropout(0.5)(pool4) print('pool4', pool4._keras_shape) # # bottom level of the U-net # conv5 = information_block(pool4, 512, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv5', conv5._keras_shape) conv5 = Dropout(0.5)(conv5) print('conv5', conv5._keras_shape) # # auxiliary output for predicting probability of nerve presence # if pars['outputs'] == 2: pre = Conv2D(1, kernel_size=(1, 1), kernel_initializer='he_normal', activation='sigmoid')(conv5) pre = Flatten()(pre) aux_out = Dense(1, activation='sigmoid', name='aux_output')(pre) # # up the U-net # after_conv4 = connection_block(conv4, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv4', after_conv4._keras_shape) up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), after_conv4], axis=3) conv6 = information_block(up6, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv6', conv6._keras_shape) conv6 = Dropout(0.5)(conv6) print('conv6', conv6._keras_shape) after_conv3 = connection_block(conv3, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv3', after_conv3._keras_shape) up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), after_conv3], axis=3) conv7 = information_block(up7, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv7', conv7._keras_shape) conv7 = Dropout(0.5)(conv7) print('conv7', conv7._keras_shape) after_conv2 = connection_block(conv2, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv2', after_conv2._keras_shape) up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), after_conv2], axis=3) conv8 = 
information_block(up8, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv8', conv8._keras_shape) conv8 = Dropout(0.5)(conv8) print('conv8', conv8._keras_shape) after_conv1 = connection_block(conv1, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv1', after_conv1._keras_shape) up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), after_conv1], axis=3) conv9 = information_block(up9, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv9', conv9._keras_shape) conv9 = Dropout(0.5)(conv9) print('conv9', conv9._keras_shape) # main output conv10 = Conv2D(1, kernel_size=(1, 1), kernel_initializer='he_normal', activation='sigmoid', name='main_output')( conv9) print('conv10', conv10._keras_shape) # creating a model # compiling the model if pars['outputs'] == 1: model = Model(inputs=inputs, outputs=conv10) model.compile(optimizer=optimizer, loss={'main_output': dice_coef_loss}, metrics={'main_output': dice_coef}) else: model = Model(inputs=inputs, outputs=[conv10, aux_out]) model.compile(optimizer=optimizer, loss={'main_output': dice_coef_loss, 'aux_output': 'binary_crossentropy'}, metrics={'main_output': dice_coef, 'aux_output': 'acc'}, loss_weights={'main_output': 1., 'aux_output': 0.5}) return model # ---------------------------------------------------------------------------------------------------------------------- # get_unet() allows to try other versions of the u-net, if more are specified get_unet = get_unet_customised if __name__ == '__main__' and (execute_u_model==True or execute_all==True): # test the u-net without training img_rows = IMG_ROWS img_cols = IMG_COLS # to check that model works without training, any kind of optimiser can be used model = get_unet(Adam(lr=1e-5), pars=PARS) x = np.random.random((1, img_rows, img_cols, 1)) result = model.predict(x, 1) print(result) print('params', model.count_params()) print('layer num', len(model.layers))'",Yes,5,53.0 execute_train = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # train # ====================================================================================================================== ######################################################################################################################## # standard-module imports import numpy as np from skimage.transform import resize from keras.callbacks import ModelCheckpoint, EarlyStopping # # separate-module imports # from u_model import get_unet, IMG_COLS as img_cols, IMG_ROWS as img_rows # from data import load_train_data, load_test_data, load_nerve_presence # from configuration import PARS, OPTIMIZER def preprocess(imgs, to_rows=None, to_cols=None): """"""Resize all images in a 4D tensor of images of the shape (samples, rows, cols, channels). 
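A rough usage sketch (the input sizes below are only an example, not fixed by this function):
        raw = np.zeros((10, 420, 580, 1), dtype=np.uint8)
        small = preprocess(raw, to_rows=80, to_cols=112)   # -> shape (10, 80, 112, 1)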
:param imgs: a 4D tensor of images of the shape (samples, rows, cols, channels) :param to_rows: new number of rows for images to be resized to :param to_cols: new number of rows for images to be resized to :return: a 4D tensor of images of the shape (samples, to_rows, to_cols, channels) """""" if to_rows is None or to_cols is None: to_rows = img_rows to_cols = img_cols print(imgs.shape) imgs_p = np.ndarray((imgs.shape[0], to_rows, to_cols, imgs.shape[3]), dtype=np.uint8) for i in range(imgs.shape[0]): imgs_p[i, :, :, 0] = resize(imgs[i, :, :, 0], (to_rows, to_cols), preserve_range=True) return imgs_p def train_and_predict(): print('-' * 30) print('Loading and preprocessing train data...') print('-' * 30) imgs_train, imgs_mask_train = load_train_data() imgs_present = load_nerve_presence() imgs_train = preprocess(imgs_train) imgs_mask_train = preprocess(imgs_mask_train) # centering and standardising the images imgs_train = imgs_train.astype('float32') mean = np.mean(imgs_train) std = np.std(imgs_train) imgs_train -= mean imgs_train /= std imgs_mask_train = imgs_mask_train.astype('float32') imgs_mask_train /= 255. # scale masks to be in {0, 1} instead of {0, 255} print('-' * 30) print('Creating and compiling model...') print('-' * 30) # load model - the Learning rate scheduler choice is most important here model = get_unet(optimizer=OPTIMIZER, pars=PARS) model_checkpoint = ModelCheckpoint('weights.h5', monitor='val_loss', save_best_only=True) early_stopping = EarlyStopping(patience=5, verbose=1) print('-' * 30) print('Fitting model...') print('-' * 30) if PARS['outputs'] == 1: imgs_labels = imgs_mask_train else: imgs_labels = [imgs_mask_train, imgs_present] model.fit(imgs_train, imgs_labels, batch_size=128, epochs=50, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint, early_stopping]) print('-' * 30) print('Loading and preprocessing test data...') print('-' * 30) imgs_test = load_test_data() imgs_test = preprocess(imgs_test) imgs_test = imgs_test.astype('float32') imgs_test -= mean imgs_test /= std print('-' * 30) print('Loading saved weights...') print('-' * 30) model.load_weights('weights.h5') print('-' * 30) print('Predicting masks on test data...') print('-' * 30) imgs_mask_test = model.predict(imgs_test, verbose=1) if PARS['outputs'] == 1: np.save('imgs_mask_test.npy', imgs_mask_test) else: np.save('imgs_mask_test.npy', imgs_mask_test[0]) np.save('imgs_mask_test_present.npy', imgs_mask_test[1]) # -------------------------------------------------------------------------------------------------------------------- if __name__ == '__main__' and (execute_train==True or execute_all==True): train_and_predict()'",Yes,5,53.0 execute_submission = False,No,5,77.0 "import pandas as pd from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tokenize import PunktSentenceTokenizer from nltk.tokenize import RegexpTokenizer from nltk.stem import PorterStemmer from nltk import pos_tag from sklearn.feature_extraction.text import TfidfVectorizer",No,5,22.0 "data = pd.read_csv(""../input/train.tsv"",delimiter='\\t') data.shape'",No,4,45.0 "adjectives = [] for i in range(0,data.shape[0]): pos_tagged = pos_tag(word_tokenize(data.iloc[i,2])) string = """" for j in range(0,len(pos_tagged)): if pos_tagged[j][1] in (""JJ"",""JJR"", ""JJS"", ""RB"", ""RBR"", ""RBS""): string = string + "" "" + pos_tagged[j][0] adjectives.append(string)",No,5,77.0 data['Adjective Review'] = 
adjectives,No,5,8.0 "data = data.drop(""PhraseId"",axis=1) data = data.drop(""SentenceId"",axis=1) data = data.drop(""Phrase"",axis=1) print(data.head())",No,4,10.0 "predictors = data['Adjective Review'] predictors.shape",No,5,58.0 "response = data[""Sentiment""] response.shape",No,5,58.0 "tv = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,2),sublinear_tf=True,max_features=1000) tv_features = tv.fit_transform(predictors)",No,5,8.0 print(tv_features.shape),No,5,58.0 "train_predictors, test_predictors, train_response, test_response = train_test_split(tv_features, response, random_state = 0)",No,5,13.0 "print(train_predictors.shape) print(test_predictors.shape)",No,5,58.0 "from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(max_features = 10).fit(train_predictors, train_response)",No,5,7.0 predicted_test_response = model.predict(test_predictors),No,5,48.0 "accuracy_score(test_response, predicted_test_response)",No,5,49.0 "test = pd.read_csv(""../input/test.tsv"",delimiter='\\t') test.shape'",No,4,45.0 test['Adjective Review'] = test_adjectives,No,5,8.0 "test = test.drop(""SentenceId"",axis=1) test = test.drop(""Phrase"",axis=1) print(test.head())",No,4,10.0 "test_predictors = test['Adjective Review'] test_predictors.shape",No,5,58.0 "test_tv = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,2),sublinear_tf=True,max_features=1000) test_tv_features = tv.fit_transform(test_predictors)",No,5,8.0 test_response = model.predict(test_tv_features),No,5,48.0 len(test_response),No,5,58.0 test['Sentiment'] = test_response,No,5,8.0 "test.head() test.shape",No,4,41.0 "test = test.drop(""Adjective Review"",axis=1)",No,5,10.0 "test.to_csv(""Submission.csv"", sep=',',index=False)'",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from keras.models import Sequential from keras.layers import CuDNNLSTM, Dropout, Dense,Conv1D, MaxPooling1D from keras.layers.embeddings import Embedding from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences import string import nltk import re from nltk.corpus import stopwords from nltk.stem.snowball import SnowballStemmer # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "test_data = pd.read_csv(""../input/test.tsv"",delimiter='\\t') train_data = pd.read_csv(""../input/train.tsv"",delimiter=""\\t"")'",No,5,45.0 from keras.utils import np_utils,No,5,22.0 "model.fit(X, Y, epochs=10, validation_split=0.2)",No,5,7.0 Y_test = model.predict(X_test),No,5,48.0 "submission = pd.DataFrame({'PhraseId' : test_data[""PhraseId""], 'Sentiment' : Y_test})'",No,5,12.0 "submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "b""train_df=pd.read_csv('../input/train.tsv',sep='\\t')\ntest_df=pd.read_csv('../input/test.tsv',sep='\\t')""",No,5,45.0 "train_df['token']=train_df.apply(tokenizer,axis=1) test_df['token']=test_df.apply(tokenizer,axis=1) ",No,5,8.0 "train_df['lemma']=train_df.apply(lemmatize,axis=1) test_df['lemma']=test_df.apply(lemmatize,axis=1)",No,5,78.0 "#Loading the Data train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') ",No,5,45.0 train.signup_method.unique(),No,5,57.0 train.affiliate_channel.unique(),No,5,57.0 train.affiliate_provider.unique(),No,5,57.0 train.first_affiliate_tracked.unique(),No,5,57.0 train.first_device_type.unique(),No,5,57.0 train.first_browser.unique(),No,5,57.0 train.signup_app.unique(),No,5,57.0 countries.sort_values(by='distance_km'),No,5,9.0 "a=train['country_destination'].value_counts() a=a.drop(['NDF','other'],axis=0) df_value_counts = pd.DataFrame(a) df_value_counts = a.reset_index() df_value_counts.columns = ['country_destination', 'value_counts'] # change column names mapdata= pd.merge(df_value_counts, countries,how= 'inner' , on='country_destination') mapdata['poppercent']=mapdata['value_counts']/mapdata['value_counts'].sum() mapdata['text']=['United States', 'France', 'Italy','United Kingdom', 'Spanish','Canada' ,'German', 'Dutch','Australia', 'Brazil' ] ",Yes,3,12.0 "# Create a world map to show distributions of users import folium from folium.plugins import MarkerCluster #empty map world_map= folium.Map(tiles=""cartodbpositron"") marker_cluster = MarkerCluster().add_to(world_map) #for each coordinate, create circlemarker of user percent for i in range(len(mapdata)): lat = mapdata.iloc[i]['lat_destination'] long = mapdata.iloc[i]['lng_destination'] radius=5 popup_text = """"""Country : {}
%of Users : {}
"""""" popup_text = popup_text.format(mapdata.iloc[i]['country_destination'], mapdata.iloc[i]['poppercent'] ) folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster) #show the map world_map'",Yes,5,33.0 "a=train.groupby('country_destination').count().sort_values(by='id',ascending=True) a.id ",Yes,5,9.0 countries.dtypes,No,5,70.0 "age_gender.drop(age_gender.index[0],inplace=True) age_gender",No,5,10.0 age_gender.country_destination.unique(),No,5,57.0 age_gender.age_bucket.unique(),No,5,57.0 "agebtw0n19= age_gender[(age_gender.age_bucket=='0-4') | (age_gender.age_bucket=='5-9') | (age_gender.age_bucket=='10-14')|(age_gender.age_bucket=='15-19')] agebtw49n34= age_gender[(age_gender.age_bucket=='45-49') | (age_gender.age_bucket=='40-44') | (age_gender.age_bucket=='35-39') | (age_gender.age_bucket=='30-34')] agebtw99n50= age_gender[(age_gender.age_bucket=='95-99') | (age_gender.age_bucket=='90-94') | (age_gender.age_bucket=='85-89') | (age_gender.age_bucket=='80-84') | (age_gender.age_bucket=='75-79') | (age_gender.age_bucket=='70-74') | (age_gender.age_bucket=='70-74') | (age_gender.age_bucket=='65-69') | (age_gender.age_bucket=='60-64') | (age_gender.age_bucket=='55-59') | (age_gender.age_bucket=='50-54') ] agebtw20n29= age_gender[(age_gender.age_bucket=='25-29') | (age_gender.age_bucket=='20-24')] ",No,4,13.0 "import seaborn as sns ax = sns.boxplot(x=agebtw20n29[""population_in_thousands""])",No,5,33.0 "import seaborn as sns ax = sns.boxplot(x=agebtw99n50[""population_in_thousands""])",No,5,33.0 "plt.figure(figsize=(20,5)) import seaborn as sns age_gender.sort_values(""age_bucket"", ascending=False,inplace=True) colors=sns.color_palette() sns.stripplot(x=""age_bucket"",y=""population_in_thousands"",data=age_gender,jitter=True,hue='country_destination',palette='pastel')'",No,5,81.0 "actionanditsdetail=pd.crosstab(age_gender.age_bucket, age_gender.country_destination,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = chi2_contingency(actionanditsdetail) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)')'",Yes,5,47.0 "actionanditsdetail=pd.crosstab(age_gender.country_destination, age_gender.gender,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = chi2_contingency(actionanditsdetail) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)') '",Yes,5,47.0 "sessions=sessions.sort_values('user_id') sessions.rename(columns={""user_id"": ""id""},inplace=True) sessions.sort_values(by='id',inplace=True) sessions = sessions[(sessions['secs_elapsed'].notnull()) & (sessions['secs_elapsed'] > 0.0) ] sessions[(sessions['action_detail']=='booking')]'",Yes,3,9.0 "groupedid=pd.DataFrame(sessions.groupby(['id'])['secs_elapsed'].sum()) groupedid['hour_elapsed']=groupedid['secs_elapsed'].div(3600).round(decimals=0) groupedid.drop('secs_elapsed',axis=1,inplace=True) groupedid.reset_index(inplace=True) groupedid[(groupedid['id'] == '6udv3scuxe') | (groupedid['id'] == 'yxf0sm9sbw') | (groupedid['id'] == 'nttj7g9av6')] ",Yes,4,12.0 groupedid.hour_elapsed.describe(),No,5,40.0 "actionndevicetype=pd.crosstab(sessions.action, sessions.device_type,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = 
chi2_contingency(actionndevicetype) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)')'",Yes,5,47.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns print(train.country_destination.value_counts()) sns.countplot(train.country_destination)",Yes,3,72.0 "import pandas as pd #Loading the Data again train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') ",Yes,5,45.0 "#missing data total = train.isnull().sum().sort_values(ascending=False) percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) missing_data.head(20)",Yes,2,11.0 "train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna('Unknown') test['first_affiliate_tracked'] = test['first_affiliate_tracked'].fillna('Unknown')",No,5,17.0 "import seaborn as sns sns.boxplot(x=train.age)",Yes,5,33.0 train.age.describe(),No,5,40.0 "b""import numpy as np\n#train=train[(train.age > 14) & (train.age < 110)]# modele sadece 14 yandan byk ve 110 yandan kkleri dahil edebilirim.\n#test=test[(test.age > 14) & (test.age < 110)]\ntrain[(train.age < 14) & (train.age > 110)]=np.nan\ntest[(test.age < 14) & (test.age > 110)]=np.nan\ntrain['age'].fillna(train['age'].mean(), inplace=True)\ntest['age'].fillna(train['age'].mean(), inplace=True)""",Yes,5,17.0 "#Converting below columns as categories for plotting in graphs categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method', 'signup_flow' ] for categorical_feature in categorical_features: train[categorical_feature] = train[categorical_feature].astype('category') for categorical_feature in categorical_features: test[categorical_feature] = test[categorical_feature].astype('category') ",No,5,16.0 "train['date_account_created'] = pd.to_datetime(train['date_account_created']) train['date_first_booking'] = pd.to_datetime(train['date_first_booking']) train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format='%Y%m%d%H%M%S') train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active']).dt.date test['date_account_created'] = pd.to_datetime(test['date_account_created']) test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format='%Y%m%d%H%M%S') test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active']).dt.date ",No,5,16.0 "import matplotlib.pyplot as plt import seaborn as sns # Use seaborn style defaults and set the default figure size df = train.groupby(['date_first_booking'])['country_destination'].count().reset_index() df.dropna(axis=0,inplace=True) import plotly.express as px fig = px.line(df, x='date_first_booking', y=""country_destination"") fig.show() '",Yes,5,81.0 "train.drop(['date_first_booking'], axis=1,inplace=True) test.drop(['date_first_booking'], axis=1,inplace=True)",No,5,10.0 "#date_account_created dac = np.vstack(train.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['dac_year'] = dac[:,0] train['dac_month'] = dac[:,1] train['dac_day'] = dac[:,2] train.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = 
np.vstack(train.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['tfa_year'] = tfa[:,0] train['tfa_month'] = tfa[:,1] train['tfa_day'] = tfa[:,2] train.drop(['timestamp_first_active'], axis=1,inplace=True) #date_account_created dac = np.vstack(test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['dac_year'] = dac[:,0] test['dac_month'] = dac[:,1] test['dac_day'] = dac[:,2] test.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(test.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['tfa_year'] = tfa[:,0] test['tfa_month'] = tfa[:,1] test['tfa_day'] = tfa[:,2] test.drop(['timestamp_first_active'], axis=1,inplace=True)",No,4,78.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() train['gender']= le.fit_transform(train['gender']) train['signup_method']= le.fit_transform(train['signup_method']) train['first_affiliate_tracked']= le.fit_transform(train['first_affiliate_tracked']) train['signup_method']= le.fit_transform(train['signup_method']) train['language']= le.fit_transform(train['language']) train['affiliate_channel']= le.fit_transform(train['affiliate_channel']) train['affiliate_provider']= le.fit_transform(train['affiliate_provider']) train['signup_app']= le.fit_transform(train['signup_app']) train['first_device_type']= le.fit_transform(train['first_device_type']) train['first_browser']= le.fit_transform(train['first_browser']) train['signup_flow']= le.fit_transform(train['signup_flow'])",Yes,5,20.0 "le = LabelEncoder() test['gender']= le.fit_transform(test['gender']) test['signup_method']= le.fit_transform(test['signup_method']) test['first_affiliate_tracked']= le.fit_transform(test['first_affiliate_tracked']) test['signup_method']= le.fit_transform(test['signup_method']) test['language']= le.fit_transform(test['language']) test['affiliate_channel']= le.fit_transform(test['affiliate_channel']) test['affiliate_provider']= le.fit_transform(test['affiliate_provider']) test['signup_app']= le.fit_transform(test['signup_app']) test['first_device_type']= le.fit_transform(test['first_device_type']) test['first_browser']= le.fit_transform(test['first_browser']) test['signup_flow']= le.fit_transform(test['signup_flow'])",Yes,5,20.0 "train.country_destination.replace('NDF',0,inplace=True) train.country_destination.replace('US',1,inplace=True) train.country_destination.replace('other',2,inplace=True) train.country_destination.replace('FR',3,inplace=True) train.country_destination.replace('CA',4,inplace=True) train.country_destination.replace('GB',5,inplace=True) train.country_destination.replace('ES',6,inplace=True) train.country_destination.replace('IT',7,inplace=True) train.country_destination.replace('PT',8,inplace=True) train.country_destination.replace('NL',9,inplace=True) train.country_destination.replace('DE',10,inplace=True) train.country_destination.replace('AU',11,inplace=True)",No,5,20.0 "from sklearn.model_selection import train_test_split y=train['country_destination'] X=train.drop(['country_destination','id'],axis=1) from imblearn.combine import SMOTETomek # transform the dataset smotetomek = SMOTETomek(sampling_strategy='auto') # split the dataset into train and test sets X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.30, random_state=1,shuffle=True,stratify=y) X_train, y_train = smotetomek.fit_resample(X_train1, y_train1)",Yes,4,13.0 "x=pd.DataFrame(X_train) 
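# note: X_train / y_train are the SMOTETomek-resampled training data, so the countplot below
# should show a roughly balanced country_destination distribution, unlike the raw training set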
Y=pd.DataFrame(y_train) result = pd.concat([x, Y], axis=1, join='inner') sns.countplot(result.country_destination)",Yes,5,12.0 "target_names = ['NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU'] #Classifier from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix from sklearn.metrics import classification_report rf=RandomForestClassifier() rf.fit(X_train,y_train) y_predrf=rf.predict(X_test1) print(classification_report(y_test1, y_predrf, target_names=target_names))",Yes,4,7.0 "import pandas as pd feature_imp = pd.Series(rf.feature_importances_,index=feature_names).sort_values(ascending=False) feature_imp ",Yes,5,79.0 "import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline # Creating a bar plot sns.barplot(x=feature_imp, y=feature_imp.index) # Add labels to your graph plt.xlabel('Feature Importance Score') plt.ylabel('Features') plt.title(""Visualizing Important Features"") plt.legend() plt.show()'",Yes,5,79.0 "pred_country={0:""NDF"", 1:""US"", 2:""other"", 3:""FR"", 4:""CA"", 5:""GB"", 6:""ES"", 7:""IT"", 8:""PT"", 9:""DE"", 10:""NL"", 11:""AU""}",No,3,77.0 "from xgboost.sklearn import XGBClassifier xgb = XGBClassifier() xgb.fit(X_train,y_train) y_predxgb=xgb.predict(X_test1) print(classification_report(y_test1, y_predxgb, target_names=target_names)) ",Yes,3,7.0 "predictionsxgb=xgb.predict(test.drop(['id'],axis=1))",No,5,48.0 "from sklearn.neural_network import MLPClassifier #Generate prediction using Neural Net mlp = MLPClassifier(activation='identity', solver='sgd',learning_rate='adaptive', alpha=0.0001, batch_size='auto') mlp.fit(X_train,y_train) predsmlp = mlp.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, predsmlp, target_names=target_names))",Yes,3,7.0 "from sklearn.neural_network import MLPClassifier from sklearn.model_selection import GridSearchCV parameters = {'solver': ['lbfgs','sgd'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3,4,5,6,7,8,9]} mlpgridsearch = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1) mlpgridsearch.fit(X_train,y_train) predsgridmlp = mlpgridsearch.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, predsgridmlp, target_names=target_names))",Yes,4,6.0 "from sklearn.naive_bayes import ComplementNB cnb = ComplementNB() cnb.fit(X_train, y_train) y_predcnb=cnb.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, y_predcnb, target_names=target_names))",Yes,4,49.0 "import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier #Loading the Data again train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna('Unknown') test['first_affiliate_tracked'] = test['first_affiliate_tracked'].fillna('Unknown') train['date_account_created'] = pd.to_datetime(train['date_account_created']) train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format='%Y%m%d%H%M%S') train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active']).dt.date test['date_account_created'] = pd.to_datetime(test['date_account_created']) 
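# timestamp_first_active is stored as a 14-digit number (YYYYMMDDHHMMSS),
# hence the explicit '%Y%m%d%H%M%S' format in the conversion below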
test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format='%Y%m%d%H%M%S') test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active']).dt.date train.drop(['date_first_booking'], axis=1,inplace=True) test.drop(['date_first_booking'], axis=1,inplace=True) #date_account_created dac = np.vstack(train.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['dac_year'] = dac[:,0] train['dac_month'] = dac[:,1] train['dac_day'] = dac[:,2] train.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(train.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['tfa_year'] = tfa[:,0] train['tfa_month'] = tfa[:,1] train['tfa_day'] = tfa[:,2] train.drop(['timestamp_first_active'], axis=1,inplace=True) #date_account_created dac = np.vstack(test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['dac_year'] = dac[:,0] test['dac_month'] = dac[:,1] test['dac_day'] = dac[:,2] test.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(test.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['tfa_year'] = tfa[:,0] test['tfa_month'] = tfa[:,1] test['tfa_day'] = tfa[:,2] test.drop(['timestamp_first_active'], axis=1,inplace=True) import numpy as np train[(train.age < 14) & (train.age > 110)]=np.nan train['age'].fillna(train['age'].mean(), inplace=True) test[(test.age < 14) & (test.age > 110)]=np.nan test['age'].fillna(train['age'].mean(), inplace=True) #Converting below columns as categories for plotting in graphs categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method', 'signup_flow' ] for categorical_feature in categorical_features: train[categorical_feature] = train[categorical_feature].astype('category') for categorical_feature in categorical_features: test[categorical_feature] = test[categorical_feature].astype('category') #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: train_dummy = pd.get_dummies(train[f], prefix=f) train_cont= train.drop([f], axis=1) train = pd.concat((train_cont, train_dummy), axis=1) #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: test_dummy = pd.get_dummies(test[f], prefix=f) test_cont= test.drop([f], axis=1) test = pd.concat((test_cont, test_dummy), axis=1) #Splitting train and test from sklearn.model_selection import train_test_split y=train['country_destination'] X=train.drop(['country_destination','id'],axis=1) from imblearn.combine import SMOTETomek transform the dataset smotetomek = SMOTETomek(sampling_strategy='auto') split the dataset into train and test sets X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.30, random_state=1,shuffle=True) X_train, y_train = smotetomek.fit_resample(X_train1, y_train1) #Classifier xgb = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1, importance_type='gain', 
interaction_constraints=None, learning_rate=0.3, max_delta_step=0, max_depth=6, min_child_weight=1, monotone_constraints=None, n_estimators=25, n_jobs=0, num_parallel_tree=1, objective='multi:softprob', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None, seed=0, subsample=0.5, tree_method=None, validate_parameters=False, verbosity=None) xgb.fit(X_train, y_train) target_names = ['NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU'] y_predxgb=xgb.predict(X_test1) from sklearn.metrics import classification_report print(classification_report(y_test1, y_predxgb, target_names=target_names))",Yes,2,7.0 "import pandas as pd from bs4 import BeautifulSoup import nltk from nltk.corpus import stopwords import re import numpy as np import time from sklearn.ensemble import RandomForestClassifier from sklearn.feature_extraction.text import CountVectorizer",No,5,22.0 "train = pd.read_csv(""../input/train.tsv"", header = 0, delimiter = '\\t') test = pd.read_csv(""../input/test.tsv"", header = 0, delimiter = '\\t')'",No,5,45.0 "def phrase_to_words(raw_phrase): #remove any html phrase_text = BeautifulSoup(raw_phrase).get_text() #remove non letters letters = re.sub(""[^A-Za-z]"", "" "", phrase_text) #to lowercase lower_letters = letters.lower().split() #remove stopwords stop = set(stopwords.words('english')) meaningful_words = [word for word in lower_letters if word not in stop] return ("" "".join(meaningful_words))'",No,5,78.0 "#First the train set num_phrase = train['Phrase'].size clean_train_phrase = [] for i in range(0, num_phrase): if( (i+1)%10000 == 0 ): print (""Review %d of %d\ "" % ( i+1, num_phrase )) clean_train_phrase.append(phrase_to_words(train['Phrase'][i]))'",No,3,78.0 "print(""Making Bag of words"") vectorizer = CountVectorizer(analyzer = ""word"", \\ tokenizer = None, \\ preprocessor = None, \\ stop_words = None, \\ max_features = 5000) train_data_features = vectorizer.fit_transform(clean_train_phrase) # Numpy arrays are easy to work with, so convert the result to an # array train_data_features = train_data_features.toarray() '",No,5,8.0 "import time start = time.time() # Start time print(""Training the Random Forest"") from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(n_estimators=100, n_jobs = -1) rf = rf.fit(train_data_features, train['Sentiment']) # Get the end time and print how long the process took end = time.time() elapsed = end - start print (""Time taken for K Means clustering: "", elapsed, ""seconds."")'",Yes,5,7.0 test.columns.values,No,5,71.0 "num_test_phrase = test['Phrase'].size clean_test_phrase = [] for i in range(0, num_test_phrase): if( (i+1)%10000 == 0 ): print (""Review %d of %d\ "" % ( i+1, num_test_phrase )) clean_test_phrase.append(phrase_to_words(test['Phrase'][i]))'",No,4,78.0 "#Get a bag of words for the test set, and convert to a numpy array test_data_features = vectorizer.transform(clean_test_phrase) test_data_features = test_data_features.toarray() #Use the random forest to make sentiment label predictions result = rf.predict(test_data_features) # Copy the results to a pandas dataframe with an ""id"" column and # a ""sentiment"" column output = pd.DataFrame( data={""PhraseId"":test[""PhraseId""], ""Sentiment"":result} ) # Use pandas to write the comma-separated output file output.to_csv( ""Sentiment_Analysis_Movie.csv"", index=False, quoting=3 )",Yes,3,48.0 "train = pd.read_csv(""../input/train.tsv"", sep='\\t') test = pd.read_csv(""../input/test.tsv"", sep='\\t') #train = train[0:1000] 
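# the commented-out slice above is handy for prototyping on a small subset;
# the next line casts Sentiment to str, and it is converted back to int a few cells
# further down before the labels are turned into torch tensors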
train['Sentiment'] = train['Sentiment'].apply(str) '",Yes,4,45.0 "train.head() ",No,5,41.0 train.Sentiment.value_counts(),No,5,72.0 "import tensorflow as tf device_name = tf.test.gpu_device_name() if device_name != '/device:GPU:0': raise SystemError('GPU device not found') print('Found GPU at: {}'.format(device_name))",Yes,5,23.0 test['Phrase'][0],No,5,41.0 train['Sentiment'].unique(),No,5,57.0 !pip install pytorch-pretrained-bert pytorch-nlp,No,5,87.0 "import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from keras.preprocessing.sequence import pad_sequences from sklearn.model_selection import train_test_split from pytorch_pretrained_bert import BertTokenizer, BertConfig from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification from tqdm import tqdm, trange import pandas as pd import io import numpy as np import matplotlib.pyplot as plt % matplotlib inline",No,5,22.0 "device = torch.device(""cuda"" if torch.cuda.is_available() else ""cpu"") n_gpu = torch.cuda.device_count() torch.cuda.get_device_name(0)",No,5,23.0 "train['Sentiment'] = train.Sentiment.astype(int) test['Sentiment'] = test.Sentiment.astype(int)",No,5,16.0 "# Create sentence and label lists sentences = train.Phrase.values # We need to add special tokens at the beginning and end of each sentence for BERT to work properly sentences = [""[CLS] "" + sentence + "" [SEP]"" for sentence in sentences] labels = train.Sentiment.values",No,5,77.0 np.unique(labels),No,5,57.0 "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] print (""Tokenize the first sentence:"") print (tokenized_texts[0])'",No,4,78.0 import seaborn as sns,No,5,22.0 sns.distplot(train.Phrase.apply(lambda x: len(x.split()))),No,5,33.0 "# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. # In the original paper, the authors used a length of 512. MAX_LEN = 32",No,5,77.0 "# Pad our input tokens input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"")",No,5,78.0 "# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]",No,5,78.0 "input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"")",No,5,78.0 "# Create attention masks attention_masks = [] # Create a mask of 1s for each token followed by 0s for padding for seq in input_ids: seq_mask = [float(i>0) for i in seq] attention_masks.append(seq_mask)",No,5,53.0 "# Use train_test_split to split our data into train and validation sets for training train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1) train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)",No,5,13.0 train_labels,No,5,41.0 "# Convert all of our data into torch tensors, the required datatype for our model train_inputs = torch.tensor(train_inputs) validation_inputs = torch.tensor(validation_inputs) train_labels = torch.tensor(train_labels) validation_labels = torch.tensor(validation_labels) train_masks = torch.tensor(train_masks) validation_masks = torch.tensor(validation_masks)",No,5,16.0 "# Select a batch size for training. 
For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32 batch_size = 32 # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, # with an iterator the entire dataset does not need to be loaded into memory train_data = TensorDataset(train_inputs, train_masks, train_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) validation_sampler = SequentialSampler(validation_data) validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) ",No,4,13.0 "# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. model = BertForSequenceClassification.from_pretrained(""bert-base-uncased"", num_labels=5) model.cuda()",No,5,30.0 "param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] ",No,5,59.0 "# This variable contains all of the hyperparemeter information our training loop needs optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)",No,3,59.0 "# Function to calculate the accuracy of our predictions vs labels def flat_accuracy(preds, labels): pred_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return np.sum(pred_flat == labels_flat) / len(labels_flat)",No,5,84.0 "train_loss_set = [] # Number of training epochs (authors recommend between 2 and 4) epochs = 4 # trange is a tqdm wrapper around the normal python range for _ in trange(epochs, desc=""Epoch""): # Training # Set our model to training mode (as opposed to evaluation mode) model.train() # Tracking variables tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # Train the data for one epoch for step, batch in enumerate(train_dataloader): # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Clear out the gradients (by default they accumulate) optimizer.zero_grad() # Forward pass loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) train_loss_set.append(loss.item()) # Backward pass loss.backward() # Update parameters and take a step using the computed gradient optimizer.step() # Update tracking variables tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 print(""Train loss: {}"".format(tr_loss/nb_tr_steps)) # Validation # Put model in evaluation mode to evaluate loss on the validation set model.eval() # Tracking variables eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 # Evaluate data for one epoch for batch in validation_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Telling the model not to compute or store gradients, saving memory and speeding up validation with torch.no_grad(): # Forward pass, calculate logit predictions logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids 
= b_labels.to('cpu').numpy() tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 print(""Validation Accuracy: {}"".format(eval_accuracy/nb_eval_steps))'",No,4,7.0 "plt.figure(figsize=(15,8)) plt.title(""Training loss"") plt.xlabel(""Batch"") plt.ylabel(""Loss"") plt.plot(train_loss_set) plt.show()",No,5,35.0 "test = pd.read_csv(""../input/test.tsv"", sep='\\t') test_id = test['PhraseId']'",No,4,45.0 "# Create sentence and label lists sentences = test.Phrase.values # We need to add special tokens at the beginning and end of each sentence for BERT to work properly sentences = [""[CLS] "" + sentence + "" [SEP]"" for sentence in sentences] # labels = test.Sentiment.values tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]",No,5,78.0 "MAX_LEN = 32 # Pad our input tokens input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"") # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"") # Create attention masks attention_masks = [] # Create a mask of 1s for each token followed by 0s for padding for seq in input_ids: seq_mask = [float(i>0) for i in seq] attention_masks.append(seq_mask) ",Yes,5,78.0 "prediction_inputs = torch.tensor(input_ids) prediction_masks = torch.tensor(attention_masks) prediction_labels = torch.tensor(labels) batch_size = 32 ",Yes,5,16.0 "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) prediction_sampler = SequentialSampler(prediction_data) prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)",No,4,13.0 "# Prediction on test set # Put model in evaluation mode model.eval() # Tracking variables predictions , true_labels = [], [] # Predict for batch in prediction_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Telling the model not to compute or store gradients, saving memory and speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Store predictions and true labels predictions.append(logits) # true_labels.append(label_ids)",No,5,48.0 preds = np.concatenate(predictions),No,5,11.0 "# preds, target = learn_classifier.get_preds(DatasetType.Test, ordered=True) labels = np.argmax(preds, axis =1) ",No,2,8.0 "submission = pd.DataFrame({'PhraseId': test_id, 'Sentiment': labels}) submission.to_csv('submission.csv', index=False) submission.head()",Yes,5,25.0 "def mlen(row): s=row['lemma'].split(' ') return len(s) train_df['len']=train_df.apply(mlen,axis=1) train_df.head()",No,5,8.0 max(train_df['len']),No,5,40.0 "test_df['len']=test_df.apply(mlen,axis=1) max(test_df['len'])",No,5,8.0 "from sklearn.preprocessing import OneHotEncoder ohe=OneHotEncoder() Y_train=train_df.iloc[:,3] Y_train=Y_train.as_matrix() Y_train=Y_train.reshape(-1,1) Y_tr=ohe.fit_transform(Y_train)",No,5,20.0 "from keras.models import Sequential from keras.layers import Dense,Dropout,Embedding,LSTM",No,5,22.0 
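The row above only imports the Sequential model and the Dense/Dropout/Embedding/LSTM layers, while the rows that follow call model.summary() and model.fit(X_train, Y_tr, batch_size=64, epochs=15) directly. Below is a minimal sketch of a compatible Keras definition, not the notebook's original: vocab_size and maxlen are hypothetical stand-ins (in practice they would come from the fitted tokenizer and the padded phrase length, cf. max(train_df['len'])), and the 5-unit softmax matches the one-hot encoded Y_tr.

# Illustrative sketch only (assumed values marked as hypothetical), not the original definition.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

vocab_size = 20000   # hypothetical tokenizer vocabulary size
maxlen = 48          # hypothetical padded phrase length

model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=maxlen))  # map word indices to 128-d vectors
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))     # encode the padded phrase
model.add(Dropout(0.3))
model.add(Dense(5, activation='softmax'))                   # 5 sentiment classes, matching Y_tr
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

With a definition of this shape, the subsequent model.fit call expects X_train to be an integer matrix of shape (n_samples, maxlen) and Y_tr to be the one-hot target produced by the OneHotEncoder cell above.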
model.summary(),No,5,79.0 "model.fit(X_train,Y_tr,batch_size=64,epochs=15)",No,5,7.0 "s=model.predict(X_test) s=np.argmax(s,axis=1) print(s) s=pd.DataFrame(s) s['PhraseId']=test_df['PhraseId'] s.columns=['Sentiment','PhraseId'] s=s[['PhraseId','Sentiment']] s.to_csv('submissions.csv',index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "try: %tensorflow_version 2.x except Exception: pass import tensorflow as tf # The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and # statistical natural language processing for English written in the Python programming language. import nltk from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer lemmatizer = WordNetLemmatizer() from bs4 import BeautifulSoup import re #TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks. from tqdm import tqdm",No,5,84.0 "from keras.utils import to_categorical import random from sklearn.model_selection import train_test_split from keras.preprocessing import sequence from keras.preprocessing.text import Tokenizer from keras.layers import Dense,Dropout,Embedding,LSTM from keras.callbacks import EarlyStopping from keras.losses import categorical_crossentropy from keras.optimizers import Adam from keras.models import Sequential #set random seed for the session and also for tensorflow that runs in background for keras tf.random.set_seed(123) random.seed(123)",No,5,23.0 "from zipfile import ZipFile for zip_path in ['../input/sentiment-analysis-on-movie-reviews/train.tsv.zip', '../input/sentiment-analysis-on-movie-reviews/test.tsv.zip']: with ZipFile(zip_path, 'r') as zip: # printing all the contents of the zip file zip.printdir() # extracting all the files print('Extracting all the files now...') zip.extractall() print('Done!') ",No,5,73.0 "sample = pd.read_csv(""../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv"") train = pd.read_csv(""train.tsv"", delimiter='\\t') test = pd.read_csv(""test.tsv"", delimiter='\\t')'",No,5,45.0 "train.shape, test.shape",No,5,58.0 " def clean_sentences(df): reviews = [] for sent in tqdm(df['Phrase']): #remove html content review_text = BeautifulSoup(sent).get_text() #remove non-alphabetic characters review_text = re.sub(""[^a-zA-Z]"","" "", review_text) #tokenize the sentences words = word_tokenize(review_text.lower()) #lemmatize each word to its lemma lemma_words = [lemmatizer.lemmatize(i) for i in words] reviews.append(lemma_words) return(reviews) train_sentences = clean_sentences(train) test_sentences = clean_sentences(test) print(len(train_sentences)) print(len(test_sentences))'",No,5,78.0 "target=train.Sentiment.values y_target=to_categorical(target) num_classes=y_target.shape[1]",No,5,20.0 "X_train,X_val,y_train,y_val = train_test_split(train_sentences, y_target, test_size=0.2, stratify=y_target)",No,5,13.0 "unique_words = set() len_max = 0 for sent in tqdm(X_train): unique_words.update(sent) if(len_max= 0].reset_index(drop=True) # y_train = 
X_train[['Store', 'Dept', 'Date', 'Weekly_Sales']] # X_train.drop('Weekly_Sales', axis=1, inplace=True) # y_train_no_neg = X_train_no_neg[['Store', 'Dept', 'Date', 'Weekly_Sales']] # X_train_no_neg.drop('Weekly_Sales', axis=1, inplace=True) # print(X_train.shape, X_train_no_neg.shape, (X_train.shape[0] - X_train_no_neg.shape[0]) / X_train.shape[0]) X_train.head()",No,4,45.0 X_train['Date'].dtype,No,5,70.0 "train=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip"") train.head()",Yes,4,45.0 "test=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip"") test.head()",Yes,5,45.0 "tesst=test.drop([""Date""],axis=1) tesst.head()",Yes,3,10.0 "sample=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip"") sample.head()",Yes,4,7.0 "sample.to_csv(""20200309.csv"",index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) os.cpu_count() # Any results you write to the current directory are saved as output.",Yes,5,88.0 "train = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip"",parse_dates=[""Date""]) test = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip"",parse_dates=[""Date""])",No,5,45.0 "add = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip"") add.head()",Yes,4,45.0 "store = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv"") store",Yes,5,45.0 "import seaborn as sns import matplotlib.pyplot as plt ax = sns.barplot(x=""Type"", y=""Size"", hue=""Type"", data=store) sns.catplot(x=""Type"", kind=""count"", palette=""ch:.25"", data=store)",No,5,33.0 "train = pd.merge(train,store,on = ""Store"",how=""left"") test = pd.merge(test,store,on = ""Store"",how=""left"")",No,5,32.0 "train[""year""] = train[""Date""].dt.year train[""month""] = train[""Date""].dt.month train[""day""] = train[""Date""].dt.day train[""week""] = train[""Date""].dt.week test[""year""] = test[""Date""].dt.year test[""month""] = test[""Date""].dt.month test[""day""] = test[""Date""].dt.day test[""week""] = test[""Date""].dt.week",No,5,8.0 "train[""Type""] = train[""Type""].replace({""A"":0,""B"":1,""C"":2}) test[""Type""] = test[""Type""].replace({""A"":0,""B"":1,""C"":2})",No,5,20.0 "def wmae(y_pred, targ, holiday_week): sumOfWeights = 0 sumofCol_B_X_Col_E = 0 for i in range(0, len(y_pred)): weight = 0 if holiday_week[i]: weight = 5 else: weight = 1 Col_B_X_Col_E = abs(targ[i] - y_pred[i])*weight sumOfWeights += weight sumofCol_B_X_Col_E += Col_B_X_Col_E WMAE = sumofCol_B_X_Col_E/sumOfWeights return WMAE",No,5,84.0 "train.groupby(""IsHoliday"")[""Weekly_Sales""].median()",No,3,40.0 "plt.figure(figsize=(10,6)) sns.barplot(train[""IsHoliday""],train[""Weekly_Sales""])",No,5,33.0 "train.groupby(""Type"")[""Weekly_Sales""].mean()",No,4,40.0 "plt.figure(figsize=(10,6)) 
sns.barplot(train[""Type""],train[""Weekly_Sales""])",No,5,33.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""year""],train[""Weekly_Sales""],showfliers=False)",No,5,75.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""Store""],train[""Weekly_Sales""],showfliers=False)",No,5,33.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""Dept""],train[""Weekly_Sales""],showfliers=False)",No,5,33.0 "feature = [""Store"",""Dept"",""year"",""month"",""day"",""week"",""IsHoliday"", ""Size""]",No,5,77.0 "X_train = train[feature] X_test = test[feature]",No,5,21.0 X_train,No,5,41.0 "y_train = train[""Weekly_Sales""]",No,5,21.0 "from sklearn.model_selection import train_test_split X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train,y_train, test_size=0.2)",Yes,5,13.0 "%%time from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor(n_estimators=10,n_jobs=4) ",Yes,5,4.0 "model.fit(X_train1,y_train1)",No,5,7.0 "(pd.DataFrame([X_train.columns,model.feature_importances_],columns=feature).T).plot.bar()",No,5,79.0 "holidays = X_test1['IsHoliday'].to_numpy() y_test1 = y_test1.to_numpy()",No,5,16.0 result = model.predict(X_test1),No,5,48.0 "WMAE = wmae(result, y_test1, holidays) print(WMAE)",No,5,49.0 "'''from sklearn.model_selection import GridSearchCV # Create the parameter grid based on the results of random search param_grid = { 'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [10, 100, 200] } # Create a based model rf = RandomForestRegressor() # Instantiate the grid search model grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = 3, verbose = 2)''' ",Yes,4,5.0 grid_search.best_params_,No,5,2.0 "data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)",No,5,84.0 "xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1, # max_depth = 5, alpha = 10, n_estimators = 10)",No,5,4.0 "xg_reg.fit(X_train1,y_train1) result = xg_reg.predict(X_test1)",Yes,4,7.0 " %%time from sklearn.ensemble import RandomForestRegressor #model = RandomForestRegressor(bootstrap= True, max_depth= 110, max_features= 3, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 200) model = RandomForestRegressor(n_estimators= 500) model.fit(X_train,y_train) result = model.predict(X_test)",Yes,4,7.0 "sub = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip"")",No,5,45.0 "sub[""Weekly_Sales""] = result sub.head()",Yes,4,55.0 "sub.to_csv(""walmart_predict_sub.csv"",index=False)",No,5,25.0 "import numpy as np import scipy as sp import pandas as pd import matplotlib.pyplot as plt import random import os import sys import re from datetime import datetime import math from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,4,22.0 "pd.options.display.max_colwidth = 400 from IPython.display import FileLink, FileLinks",Yes,5,23.0 "train = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/train_2.csv.zip') keys = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/key_2.csv.zip') sub = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/sample_submission_2.csv.zip')",No,5,45.0 keys.head(),No,5,41.0 "def split_page_col(page): tokens = page.split('_') article_name = ''.join(tokens[:-3]) org = tokens[-3] access = tokens[-2] crawler = tokens[-1] return (article_name, org, access, crawler) def split_page_col_wdate(page): tokens 
= page.split('_') article_name = ''.join(tokens[:-4]) org = tokens[-4] access = tokens[-3] crawler = tokens[-2] date = tokens[-1] return (article_name, org, access, crawler,date)",No,5,78.0 "keys['date'] = keys['Page'].apply(lambda x: x.split('_')[-1]) keys['Page'] = keys['Page'].apply(lambda x: '_'.join(x.split('_')[:-1])) keys['date'] = pd.to_datetime(keys['date'], format='%Y-%m-%d')",No,5,8.0 keys.tail(),No,5,41.0 "sub = sub.merge(keys, on='Id', how='left')",No,5,32.0 "sub['date'] = pd.to_datetime(sub['date'], format='%Y-%m-%d')",No,5,16.0 "print(sub['date'].min(), sub['date'].max()) print(sub['date'].max() - sub['date'].min())",No,3,40.0 "print(sub['Page'].nunique()) print(train['Page'].nunique())",No,5,54.0 "train.iloc[:, 755:803]",No,5,14.0 "print(sub.shape) print(train.shape)",No,5,58.0 "prev_year_data_cols = pd.date_range('2016-09-13', '2016-11-13') train_flat = pd.melt(train.loc[:, ['Page'] + list(prev_year_data_cols.date.astype(str))], id_vars='Page', var_name='date') train_flat['date'] = pd.to_datetime(train_flat['date'], format='%Y-%m-%d')",Yes,4,16.0 train_flat.head(),No,5,41.0 "train_flat['prediction_date'] = train_flat['date'] + pd.DateOffset(years=1) sub = sub[['Page', 'date', 'Id']].merge(train_flat[['Page', 'prediction_date', 'value']], left_on=('Page', 'date'), right_on=('Page', 'prediction_date')) sub['value'] = sub['value'].fillna(0)",Yes,3,32.0 "sub[['Id', 'value']].rename(columns={'value': 'visits'}).to_csv('all_submission.csv', index=False) FileLink('all_submission.csv')",Yes,5,25.0 "page_median = train.iloc[:, 1:].median(axis=1, skipna=True)",No,3,40.0 "page_median = pd.DataFrame({'Page': train['Page'], 'median': page_median})",No,5,12.0 page_median.head(),No,5,41.0 "sub_median = sub.merge(page_median, on='Page')[['Id', 'median']]",No,5,32.0 sub_median.isnull().mean(),No,3,40.0 sub_median,No,5,53.0 "sub_median.rename(columns={'median': 'visits'}).to_csv('submission.csv', index=False) FileLink('submission.csv')",Yes,5,25.0 "prev_year_data_cols = pd.date_range('2016-09-13', '2016-11-13') prev_year_median = train.loc[:, list(prev_year_data_cols.date.astype(str))].median(axis=1, skipna=True) ",Yes,5,77.0 "prev_year_median = pd.DataFrame({'Page': train['Page'], 'visits': prev_year_median})",No,5,12.0 "sub_prev_year_median = sub.merge(prev_year_median, on='Page')[['Id', 'visits']]",No,5,32.0 sub_prev_year_median.isnull().mean(),No,4,57.0 sub_prev_year_median['visits'] = sub_prev_year_median['visits'].fillna(sub_prev_year_median['visits'].median()),No,5,17.0 "median_60 = train.iloc[:, -60:].median(axis=1, skipna=True) median_60 = pd.DataFrame({'Page': train['Page'], 'visits': median_60}) sub_median_60 = sub.merge(median_60, on='Page')[['Id', 'visits']]",Yes,4,12.0 sub_median_60.isnull().mean(),No,3,57.0 sub_median_60['visits'] = sub_median_60['visits'].fillna(0),No,5,17.0 "# Libraries import numpy as np import pandas as pd pd.set_option('max_columns', None) import matplotlib.pyplot as plt import seaborn as sns plt.style.use('seaborn') %matplotlib inline import copy import datetime import lightgbm as lgb from scipy import stats from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold from sklearn.preprocessing import StandardScaler import os import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go import plotly.tools as tls import xgboost as xgb import lightgbm as lgb from sklearn import model_selection from sklearn.metrics import accuracy_score, 
roc_auc_score, log_loss, classification_report, confusion_matrix import json import ast import time from sklearn import linear_model import warnings warnings.filterwarnings('ignore') import os import glob from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import LabelEncoder",Yes,5,23.0 "class LGBWrapper(object): """""" A wrapper for lightgbm model so that we will have a single api for various models. """""" def __init__(self): self.model = lgb.LGBMClassifier() def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None): eval_set = [(X_train, y_train)] eval_names = ['train'] self.model = self.model.set_params(**params) if X_valid is not None: eval_set.append((X_valid, y_valid)) eval_names.append('valid') if X_holdout is not None: eval_set.append((X_holdout, y_holdout)) eval_names.append('holdout') if 'cat_cols' in params.keys(): cat_cols = [col for col in params['cat_cols'] if col in X_train.columns] if len(cat_cols) > 0: categorical_columns = params['cat_cols'] else: categorical_columns = 'auto' else: categorical_columns = 'auto' self.model.fit(X=X_train, y=y_train, eval_set=eval_set, eval_names=eval_names, verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds']) self.best_score_ = self.model.best_score_ self.feature_importances_ = self.model.feature_importances_ def predict_proba(self, X_test): if self.model.objective == 'binary': return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)[:, 1] else: return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)'",Yes,5,53.0 "class ClassifierModel(object): """""" A wrapper class for classification models. It can be used for training and prediction. Can plot feature importance and training progress (if relevant for model). """""" def __init__(self, columns: list = None, model_wrapper=None): """""" :param original_columns: :param model_wrapper: """""" self.columns = columns self.model_wrapper = model_wrapper self.result_dict = {} self.train_one_fold = False self.preprocesser = None def fit(self, X: pd.DataFrame, y, X_holdout: pd.DataFrame = None, y_holdout=None, folds=None, params: dict = None, eval_metric='auc', cols_to_drop: list = None, preprocesser=None, transformers: dict = None, adversarial: bool = False, plot: bool = True): """""" Training the model. :param X: training data :param y: training target :param X_holdout: holdout data :param y_holdout: holdout target :param folds: folds to split the data. 
If not defined, then model will be trained on the whole X :param params: training parameters :param eval_metric: metric for validataion :param cols_to_drop: list of columns to drop (for example ID) :param preprocesser: preprocesser class :param transformers: transformer to use on folds :param adversarial :return: """""" self.cols_to_drop = cols_to_drop if folds is None: folds = KFold(n_splits=3, random_state=42) self.train_one_fold = True self.columns = X.columns if self.columns is None else self.columns self.feature_importances = pd.DataFrame(columns=['feature', 'importance']) self.trained_transformers = {k: [] for k in transformers} self.transformers = transformers self.models = [] self.folds_dict = {} self.eval_metric = eval_metric n_target = 1 if len(set(y.values)) == 2 else len(set(y.values)) self.oof = np.zeros((len(X), n_target)) self.n_target = n_target X = X[self.columns] if X_holdout is not None: X_holdout = X_holdout[self.columns] if preprocesser is not None: self.preprocesser = preprocesser self.preprocesser.fit(X, y) X = self.preprocesser.transform(X, y) self.columns = X.columns.tolist() if X_holdout is not None: X_holdout = self.preprocesser.transform(X_holdout) # y = X['accuracy_group'] for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)): if X_holdout is not None: X_hold = X_holdout.copy() else: X_hold = None self.folds_dict[fold_n] = {} if params['verbose']: print(f'Fold {fold_n + 1} started at {time.ctime()}') self.folds_dict[fold_n] = {} X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] if self.train_one_fold: X_train = X[self.original_columns] y_train = y X_valid = None y_valid = None datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train} X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop) self.folds_dict[fold_n]['columns'] = X_train.columns.tolist() model = copy.deepcopy(self.model_wrapper) if adversarial: X_new1 = X_train.copy() if X_valid is not None: X_new2 = X_valid.copy() elif X_holdout is not None: X_new2 = X_holdout.copy() X_new = pd.concat([X_new1, X_new2], axis=0) y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0])))) X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new) model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params) self.folds_dict[fold_n]['scores'] = model.best_score_ if self.oof.shape[0] != len(X): self.oof = np.zeros((X.shape[0], self.oof.shape[1])) if not adversarial: self.oof[valid_index] = model.predict_proba(X_valid).reshape(-1, n_target) fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)), columns=['feature', 'importance']) self.feature_importances = self.feature_importances.append(fold_importance) self.models.append(model) self.feature_importances['importance'] = self.feature_importances['importance'].astype(float) # if params['verbose']: self.calc_scores_() if plot: # print(classification_report(y, self.oof.argmax(1))) print(classification_report(y, (self.oof > 0.5) * 1)) fig, ax = plt.subplots(figsize=(16, 12)) plt.subplot(2, 2, 1) self.plot_feature_importance(top_n=25) plt.subplot(2, 2, 2) self.plot_metric() plt.subplot(2, 2, 3) g = sns.heatmap(confusion_matrix(y, (self.oof > 0.5) * 1), annot=True, cmap=plt.cm.Blues,fmt=""d"") g.set(ylim=(-0.5, 4), xlim=(-0.5, 4), title='Confusion matrix') plt.subplot(2, 2, 4) plt.hist(self.oof) plt.xticks(range(self.n_target), range(self.n_target)) plt.title('Distribution 
of oof predictions'); def transform_(self, datasets, cols_to_drop): for name, transformer in self.transformers.items(): transformer.fit(datasets['X_train'], datasets['y_train']) datasets['X_train'] = transformer.transform(datasets['X_train']) if datasets['X_valid'] is not None: datasets['X_valid'] = transformer.transform(datasets['X_valid']) if datasets['X_holdout'] is not None: datasets['X_holdout'] = transformer.transform(datasets['X_holdout']) self.trained_transformers[name].append(transformer) if cols_to_drop is not None: cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns] self.cols_to_drop = cols_to_drop datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1) if datasets['X_valid'] is not None: datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1) if datasets['X_holdout'] is not None: datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1) return datasets['X_train'], datasets['X_valid'], datasets['X_holdout'] def calc_scores_(self): print() datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0] self.scores = {} for d in datasets: scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()] print(f""CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std."") self.scores[d] = np.mean(scores) def predict(self, X_test, averaging: str = 'usual'): """""" Make prediction :param X_test: :param averaging: method of averaging :return: """""" full_prediction = np.zeros((X_test.shape[0], self.oof.shape[1])) if self.preprocesser is not None: X_test = self.preprocesser.transform(X_test) for i in range(len(self.models)): X_t = X_test.copy() for name, transformers in self.trained_transformers.items(): X_t = transformers[i].transform(X_t) if self.cols_to_drop: cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns] X_t = X_t.drop(cols_to_drop, axis=1) y_pred = self.models[i].predict_proba(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1]) # if case transformation changes the number of the rows if full_prediction.shape[0] != len(y_pred): full_prediction = np.zeros((y_pred.shape[0], self.oof.shape[1])) if averaging == 'usual': full_prediction += y_pred elif averaging == 'rank': full_prediction += pd.Series(y_pred).rank().values return full_prediction / len(self.models) def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10): """""" Plot default feature importance. :param drop_null_importance: drop columns with null feature importance :param top_n: show top n columns :return: """""" top_feats = self.get_top_features(drop_null_importance, top_n) feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)] feature_importances['feature'] = feature_importances['feature'].astype(str) top_feats = [str(i) for i in top_feats] sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats) plt.title('Feature importances') def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10): """""" Get top features by importance. :param drop_null_importance: :param top_n: :return: """""" grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean() if drop_null_importance: grouped_feats = grouped_feats[grouped_feats != 0] return list(grouped_feats.sort_values(ascending=False).index)[:top_n] def plot_metric(self): """""" Plot training progress. 
Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html :return: """""" full_evals_results = pd.DataFrame() for model in self.models: evals_result = pd.DataFrame() for k in model.model.evals_result_.keys(): evals_result[k] = model.model.evals_result_[k][self.eval_metric] evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'}) full_evals_results = full_evals_results.append(evals_result) full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric, 'variable': 'dataset'}) full_evals_results[self.eval_metric] = np.abs(full_evals_results[self.eval_metric]) sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset') plt.title('Training progress')'",Yes,5,53.0 "data_dict = {} for i in glob.glob('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/*'): name = i.split('/')[-1].split('.')[0] if name != 'MTeamSpellings': data_dict[name] = pd.read_csv(i) else: data_dict[name] = pd.read_csv(i, encoding='cp1252')",No,3,45.0 data_dict.keys(),No,2,40.0 data_dict['MNCAATourneySeeds'].head(),No,5,41.0 data_dict['MNCAATourneyCompactResults'].head(),No,5,41.0 "data_dict['MNCAATourneyCompactResults'].groupby(['Season'])['WScore'].mean().plot(kind='line'); plt.title('Mean scores of winning teams by season in tourneys');",No,5,75.0 data_dict['MRegularSeasonCompactResults'],No,5,41.0 "data_dict['MRegularSeasonCompactResults'].groupby(['Season'])['WScore'].mean().plot(); plt.title('Mean scores of winning teams by season in regular plays');",No,5,75.0 "# process seed data_dict['MNCAATourneySeeds'] = data_dict['MNCAATourneySeeds'].loc[data_dict['MNCAATourneySeeds']['Season'] <= 2014] data_dict['MNCAATourneySeeds']['Seed'] = data_dict['MNCAATourneySeeds']['Seed'].apply(lambda x: int(x[1:3])) # take only useful columns data_dict['MNCAATourneySeeds'] = data_dict['MNCAATourneySeeds'][['Season', 'TeamID', 'Seed']] data_dict['MNCAATourneyCompactResults'] = data_dict['MNCAATourneyCompactResults'][['Season','WTeamID', 'LTeamID']] data_dict['MNCAATourneyCompactResults'] = data_dict['MNCAATourneyCompactResults'].loc[data_dict['MNCAATourneyCompactResults']['Season'] <= 2014] # merge the data and rename the columns df = pd.merge(data_dict['MNCAATourneyCompactResults'], data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID']) df = pd.merge(df, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID']) df = df.drop(['TeamID_x', 'TeamID_y'], axis=1) df.columns = ['Season', 'WTeamID', 'LTeamID', 'WSeed', 'LSeed'] df.head()",Yes,3,8.0 "team_win_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'WTeamID']).agg({'WScore':['sum', 'count']}).reset_index() team_win_score.columns = ['Season', 'WTeamID', 'WScore_sum', 'WScore_count'] team_loss_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'LTeamID']).agg({'LScore':['sum', 'count']}).reset_index() team_loss_score.columns = ['Season', 'LTeamID', 'LScore_sum', 'LScore_count'] df = pd.merge(df, team_win_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'WTeamID']) df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'LTeamID']) df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'LTeamID']) df = pd.merge(df, team_win_score, how='left', left_on=['Season', 
'LTeamID_x'], right_on=['Season', 'WTeamID']) df.drop(['LTeamID_y', 'WTeamID_y'], axis=1, inplace=True) df.head()",Yes,3,32.0 "df_win = df.copy() df_los = df.copy() df_win = df_win[['WSeed', 'LSeed', 'x_score', 'y_score', 'x_count', 'y_count']] df_los = df_los[['LSeed', 'WSeed', 'y_score', 'x_score', 'y_count', 'x_count']] df_win.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2'] df_los.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']",Yes,4,12.0 "df_win['Seed_diff'] = df_win['Seed_1'] - df_win['Seed_2'] df_win['Score_diff'] = df_win['Score_1'] - df_win['Score_2'] df_los['Seed_diff'] = df_los['Seed_1'] - df_los['Seed_2'] df_los['Score_diff'] = df_los['Score_1'] - df_los['Score_2'] df_win['Count_diff'] = df_win['Count_1'] - df_win['Count_2'] df_win['Mean_score1'] = df_win['Score_1'] / df_win['Count_1'] df_win['Mean_score2'] = df_win['Score_2'] / df_win['Count_2'] df_win['Mean_score_diff'] = df_win['Mean_score1'] - df_win['Mean_score2'] df_los['Count_diff'] = df_los['Count_1'] - df_los['Count_2'] df_los['Mean_score1'] = df_los['Score_1'] / df_los['Count_1'] df_los['Mean_score2'] = df_los['Score_2'] / df_los['Count_2'] df_los['Mean_score_diff'] = df_los['Mean_score1'] - df_los['Mean_score2']",No,3,8.0 "df_win['result'] = 1 df_los['result'] = 0 data = pd.concat((df_win, df_los)).reset_index(drop=True)",No,4,11.0 "for col in ['Score_1', 'Score_2', 'Count_1', 'Count_2', 'Score_diff', 'Count_diff']: print(col) data[col] = data[col].fillna(0).astype(int)",No,5,17.0 "X = data.drop(['result'], axis=1) y = data['result']",No,5,21.0 "# some of params are from this kernel: https://www.kaggle.com/ratan123/march-madness-2020-ncaam-simple-lightgbm-on-kfold param = {'n_estimators':10000, 'num_leaves': 400, 'min_child_weight': 0.034, 'feature_fraction': 0.379, 'bagging_fraction': 0.418, 'min_data_in_leaf': 106, 'objective': 'binary', 'max_depth': -1, 'learning_rate': 0.007, ""boosting_type"": ""gbdt"", #""bagging_seed"": 11, ""metric"": 'binary_logloss', ""verbosity"": 10, 'reg_alpha': 0.3899, 'reg_lambda': 0.648, 'random_state': 47, 'task':'train', 'nthread':-1, 'verbose': 100, 'early_stopping_rounds': 30, 'eval_metric': 'binary_logloss' } cat_cols = [] mt = MainTransformer(create_interactions=False) # ct = CategoricalTransformer(drop_original=True, cat_cols=cat_cols) ft = FeatureTransformer() transformers = {'ft': ft} lgb_model = ClassifierModel(model_wrapper=LGBWrapper()) lgb_model.fit(X=X, y=y, folds=folds, params=param, preprocesser=mt, transformers=transformers, eval_metric='binary_logloss', cols_to_drop=None, plot=True)'",Yes,3,59.0 "test = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv') test = test.drop(['Pred'], axis=1) test['Season'] = test['ID'].apply(lambda x: int(x.split('_')[0])) test['Team1'] = test['ID'].apply(lambda x: int(x.split('_')[1])) test['Team2'] = test['ID'].apply(lambda x: int(x.split('_')[2])) test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']) test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID']) test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'WTeamID']) test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'LTeamID']) test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 
'LTeamID']) test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'WTeamID']) test['seed_diff'] = test['Seed_x'] - test['Seed_y']",Yes,2,45.0 "test['x_score'] = test['WScore_sum_x'] + test['LScore_sum_y'] test['y_score'] = test['WScore_sum_y'] + test['LScore_sum_x'] test['x_count'] = test['WScore_count_x'] + test['LScore_count_y'] test['y_count'] = test['WScore_count_y'] + test['WScore_count_x']",No,5,8.0 "test = test[['Seed_x', 'Seed_y', 'x_score', 'y_score', 'x_count', 'y_count']] test.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']",No,4,10.0 "test['Seed_diff'] = test['Seed_1'] - test['Seed_2'] test['Score_diff'] = test['Score_1'] - test['Score_2'] test['Seed_diff'] = test['Seed_1'] - test['Seed_2'] test['Score_diff'] = test['Score_1'] - test['Score_2'] test['Count_diff'] = test['Count_1'] - test['Count_2'] test['Mean_score1'] = test['Score_1'] / test['Count_1'] test['Mean_score2'] = test['Score_2'] / test['Count_2'] test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2'] test['Count_diff'] = test['Count_1'] - test['Count_2'] test['Mean_score1'] = test['Score_1'] / test['Count_1'] test['Mean_score2'] = test['Score_2'] / test['Count_2'] test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2']",No,5,8.0 test_preds = lgb_model.predict(test),No,5,48.0 plt.hist(test_preds);,No,5,56.0 "submission_df = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv') submission_df['Pred'] = test_preds submission_df",Yes,5,45.0 "submission_df.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt from itertools import cycle, islice import seaborn as sb import matplotlib.dates as dates import datetime as dt import plotly.offline as py py.init_notebook_mode(connected=True) from plotly import tools, subplots import plotly.figure_factory as ff import plotly.express as px import plotly.graph_objects as go import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,4,88.0 "train_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/train.csv"")#index_col=0 display(train_data.head()) test_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/test.csv"")#index_col=0 display(test_data.head())",Yes,4,45.0 "sum_df = pd.pivot_table(train_data, values=['ConfirmedCases','Fatalities'], index=['Date'],aggfunc=np.sum) display(sum_df.max())",Yes,3,12.0 "def getColumnInfo(df): n_province = df['Province_State'].nunique() n_country = df['Country_Region'].nunique() n_days = df['Date'].nunique() start_date = df['Date'].unique()[0] end_date = df['Date'].unique()[-1] return n_province, n_country, n_days, start_date, end_date n_train = train_data.shape[0] n_test = test_data.shape[0] n_prov_train, n_count_train, n_train_days, start_date_train, end_date_train = getColumnInfo(train_data) n_prov_test, n_count_test, n_test_days, start_date_test, end_date_test = getColumnInfo(test_data) print ('<==Train data==> \ # of Province_State: '+str(n_prov_train),', # of Country_Region:'+str(n_count_train), ', Time Period: '+str(start_date_train)+' to '+str(end_date_train), '==> days:',str(n_train_days)) print(""\ Countries with Province/State information: "", train_data[train_data['Province_State'].isna()==False]['Country_Region'].unique()) print ('\ 
<==Test data==> \ # of Province_State: '+str(n_prov_test),', # of Country_Region:'+str(n_count_test), ', Time Period: '+start_date_test+' to '+end_date_test, '==> days:',n_test_days) df_test = test_data.loc[test_data.Date > '2020-04-14'] overlap_days = n_test_days - df_test.Date.nunique() print('\ overlap days with training data: ', overlap_days, ', total days: ', n_train_days+n_test_days-overlap_days)'",No,3,54.0 "prob_confirm_check_train = train_data.ConfirmedCases.value_counts(normalize=True) prob_fatal_check_train = train_data.Fatalities.value_counts(normalize=True) n_confirm_train = train_data.ConfirmedCases.value_counts()[1:].sum() n_fatal_train = train_data.Fatalities.value_counts()[1:].sum() print('Percentage of confirmed case records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_confirm_train, n_train, prob_confirm_check_train[1:].sum()*100)) print('Percentage of fatality records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_fatal_train, n_train, prob_fatal_check_train[1:].sum()*100))",No,3,72.0 "from itertools import cycle, islice discrete_col = list(islice(cycle(['orange', 'r', 'g', 'k', 'b', 'c', 'm']), None, len(train_data_by_country_confirm.head(30)))) plt.rcParams.update({'font.size': 22}) train_data_by_country_confirm.head(20).plot(figsize=(20,15), kind='barh', color=discrete_col) plt.legend([""Confirmed Cases"", ""Fatalities""]); plt.xlabel(""Number of Covid-19 Affectees"") plt.title(""First 20 Countries with Highest Confirmed Cases"") ylocs, ylabs = plt.yticks() for i, v in enumerate(train_data_by_country_confirm.head(20)[""ConfirmedCases""][:]): plt.text(v+0.01, ylocs[i]-0.25, str(int(v)), fontsize=12) for i, v in enumerate(train_data_by_country_confirm.head(20)[""Fatalities""][:]): if v > 0: #disply for only >300 fatalities plt.text(v+0.01,ylocs[i]+0.1,str(int(v)),fontsize=12) '",No,4,33.0 "def reformat_time(reformat, ax): ax.xaxis.set_major_locator(dates.WeekdayLocator()) ax.xaxis.set_major_formatter(dates.DateFormatter('%b %d')) if reformat: #reformat again if you wish date_list = train_data_by_date.reset_index()[""Date""].tolist() x_ticks = [dt.datetime.strftime(t,'%Y-%m-%d') for t in date_list] x_ticks = [tick for i,tick in enumerate(x_ticks) if i%8==0 ]# split labels into same number of ticks as by pandas ax.set_xticklabels(x_ticks, rotation=90) # cosmetics ax.yaxis.grid(linestyle='dotted') ax.spines['right'].set_color('none') ax.spines['top'].set_color('none') ax.spines['left'].set_color('none') ax.spines['bottom'].set_color('none') train_data['Date'] = pd.to_datetime(train_data['Date']) train_data_by_date = train_data.groupby(['Date'],as_index=True).agg({'ConfirmedCases': 'sum','Fatalities': 'sum', 'NewConfirmedCases':'sum', 'NewFatalities':'sum', 'MortalityRate':'mean'}) num0 = train_data_by_date._get_numeric_data() num0[num0 < 0.0] = 0.0 #display(train_data_by_date.head()) ## ======= Sort by countries with fatalities > 600 ======== train_data_by_country_max = train_data.groupby(['Country_Region'],as_index=True).agg({'ConfirmedCases': 'max', 'Fatalities': 'max'}) train_data_by_country_fatal = train_data_by_country_max[train_data_by_country_max['Fatalities']>600] train_data_by_country_fatal = train_data_by_country_fatal.sort_values(by=['Fatalities'],ascending=False).reset_index() #display(train_data_by_country_fatal.head(20)) df_merge_by_country = pd.merge(train_data,train_data_by_country_fatal['Country_Region'],on=['Country_Region'],how='inner') df_max_fatality_country = df_merge_by_country.groupby(['Date','Country_Region'],as_index=False).agg({'ConfirmedCases': 
'sum', 'Fatalities': 'sum', 'NewConfirmedCases':'sum', 'NewFatalities':'sum', 'MortalityRate':'mean'}) num1 = df_max_fatality_country._get_numeric_data() num1[num1 < 0.0] = 0.0 df_max_fatality_country.set_index('Date',inplace=True) #display(df_max_fatality_country.head(20)) countries = train_data_by_country_fatal['Country_Region'].unique() plt.rcParams.update({'font.size': 16}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(15, 8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(15, 8))#,sharey=True) train_data_by_date.ConfirmedCases.plot(ax=ax0, x_compat=True, title='Confirmed Cases Globally', legend='Confirmed Cases', color=discrete_col)#, logy=True) reformat_time(0,ax0) train_data_by_date.NewConfirmedCases.plot(ax=ax0, x_compat=True, linestyle='dotted', legend='New Confirmed Cases', color=discrete_col)#, logy=True) reformat_time(0,ax0) train_data_by_date.Fatalities.plot(ax=ax2, x_compat=True, title='Fatalities Globally', legend='Fatalities', color='r') reformat_time(0,ax2) train_data_by_date.NewFatalities.plot(ax=ax2, x_compat=True, linestyle='dotted', legend='Daily Deaths',color='r')#tell pandas not to use its own datetime format reformat_time(0,ax2) for country in countries: match = df_max_fatality_country.Country_Region==country df_fatality_by_country = df_max_fatality_country[match] df_fatality_by_country.ConfirmedCases.plot(ax=ax1, x_compat=True, title='Confirmed Cases Nationally') reformat_time(0,ax1) df_fatality_by_country.Fatalities.plot(ax=ax3, x_compat=True, title='Fatalities Nationally') reformat_time(0,ax3) #ax1.legend(countries) #ax3.legend(countries) ax1.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5)) '",No,4,75.0 "fig = plt.figure() fig,(ax4,ax5) = plt.subplots(1,2,figsize=(20, 8)) #train_data_by_date.loc[(train_data_by_date.ConfirmedCases > 200)]#useless, its already summed. 
train_data_by_date.MortalityRate.plot(ax=ax4, x_compat=True, legend='Mortality Rate',color='r')#tell pandas not to use its own datetime format reformat_time(0,ax4) for num, country in enumerate(countries): match = df_max_fatality_country.Country_Region==country df_fatality_by_country = df_max_fatality_country[match] df_fatality_by_country.MortalityRate.plot(ax=ax5, x_compat=True, title='Average Mortality Rate Nationally') reformat_time(0,ax5) ax5.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5))",No,4,75.0 "train_data_by_max_date = train_data_by_country.query('(Date == @max_train_date) & (ConfirmedCases > 100)') train_data_by_max_date.loc[:, 'MortalityRate'] = train_data_by_max_date.loc[:,'Fatalities']/train_data_by_max_date.loc[:,'ConfirmedCases'] train_data_by_mortality = train_data_by_max_date.sort_values('MortalityRate', ascending=False) train_data_by_mortality.set_index('Country_Region', inplace=True) #display(train_data_by_mortality.head()) palette = plt.get_cmap('OrRd_r') rainbow_col = [palette(1.*i/20.0) for i in range(20)] train_data_by_mortality.MortalityRate.head(20).plot(figsize=(15,10), kind='barh', color=rainbow_col) plt.xlabel(""Mortality Rate"") plt.title(""First 20 Countries with Highest Mortality Rate"") ylocs, ylabs = plt.yticks() '",No,5,33.0 "#import plotly.io as pio # to set shahin plot layout world_df = train_data_by_country.query('Date == @max_train_date') world_df.loc[:,'Date'] = world_df.loc[:,'Date'].apply(str) world_df.loc[:,'Confirmed_log'] = round(np.log10(world_df.loc[:,'ConfirmedCases'] + 1), 3) world_df.loc[:,'Fatalities_log'] = np.log10(world_df.loc[:,'Fatalities'] + 1) world_df.loc[:,'MortalityRate'] = round(world_df.loc[:, 'Fatalities'] / world_df.loc[:,'ConfirmedCases'], 3) world_df.loc[:,'GrowthFactor'] = round(world_df.loc[:,'GrowthRate'], 3) #display(world_df.head()) fig1 = px.choropleth(world_df, locations=""Country_Region"", locationmode=""country names"", color=""Confirmed_log"", hover_name=""Country_Region"", hover_data=['ConfirmedCases', 'Fatalities', 'MortalityRate', 'GrowthFactor'], range_color=[world_df['Confirmed_log'].min(), world_df['Confirmed_log'].max()], color_continuous_scale = px.colors.sequential.Plasma, title='COVID-19: Confirmed Cases') fig1.show() '",No,5,84.0 "fig2 = px.scatter_geo(world_df, locations=""Country_Region"", locationmode=""country names"", color=""ConfirmedCases"", size='ConfirmedCases', hover_name=""Country_Region"", hover_data=['ConfirmedCases', 'Fatalities', 'MortalityRate', 'GrowthFactor'], range_color= [world_df['Confirmed_log'].min(), world_df['ConfirmedCases'].max()], projection=""natural earth"", animation_frame=""Date"", animation_group=""Country_Region"", color_continuous_scale=""portland"", title='COVID-19: Spread Over Time') #fig2.layout.updatemenus[0].buttons[0].args[1][""frame""][""duration""] = 10 #fig2.layout.updatemenus[0].buttons[0].args[1][""transition""][""duration""] = 10 fig2.layout.coloraxis.showscale = False #fig2.layout.sliders[0].pad.t = 10 #fig2.layout.updatemenus[0].pad.t= 10 fig2.show()'",No,5,84.0 "from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline from tqdm import tqdm plt.rcParams.update({'font.size': 12}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20, 8)) countries_europe = ['Italy', 'France', 'Spain', 'Germany', 'United Kingdom'] # Take the 1st day as 2020-02-23 df = train_data.loc[train_data.Date >= '2020-02-23'] n_days_europe = df.Date.nunique() rainbow_col= 
plt.cm.jet(np.linspace(0,1,len(countries))) for country, c in tqdm(zip(countries,rainbow_col)): df_country_train = df_max_fatality_country[df_max_fatality_country['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] df_country_train = df_country_train.reset_index()[df_country_train.reset_index().Date > '2020-02-22'] n_days_sans_China = df.Date.nunique() - df_country_train.Date.nunique() x_train = np.arange(1, n_days_europe+1).reshape((-1,1)) x_test = (np.arange(1,n_days_europe+n_test_days+1-overlap_days)).reshape((-1,1)) y_train_f = df_country_train['Fatalities'] #print (x_train, y_train_f) model_f = make_pipeline(PolynomialFeatures(degree=3), Ridge(fit_intercept=False)) model_f = model_f.fit(x_train, y_train_f) y_predict_f = model_f.predict(x_test) #print (x_test[-n_test_days:], y_predict_f[-n_test_days:]) y_train_c = df_country_train['ConfirmedCases'] model_c = make_pipeline(PolynomialFeatures(degree=3), Ridge(fit_intercept=False)) model_c = model_c.fit(x_train, y_train_c) y_predict_c = model_c.predict(x_test) extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') ax0.plot(x_test[-n_test_days:], y_predict_c[-n_test_days:],linewidth=2, label='predict_'+country, color=c) ax0.plot(x_train, y_train_c, linewidth=2, color=c, linestyle='dotted', label='train_'+country) ax0.set_title(""Prediction vs Training for Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") #ax0.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) #ax0.set_yscale('log') ax1.plot(x_test[-(n_test_days):], y_predict_f[-(n_test_days):],linewidth=2, label='predict_'+country, color=c) ax1.plot(x_train, y_train_f, linewidth=2, color=c, linestyle='dotted', label='train_'+country) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) #ax1.set_yscale('log')'",Yes,5,56.0 "from scipy.optimize.minpack import curve_fit from sklearn.metrics import r2_score from scipy.special import expit def Gompertz(a, c, t, t0): Q = a * np.exp(-np.exp(-c*(t-t0))) return Q def Boltzman(a, c, t, t0): Q = a / (1 + np.exp(-c*(t-t0))) return Q emerging_countries = ['Albania', 'Andorra', 'Argentina', 'Armenia', 'Azerbaijan', 'Bahrain', 'Barbados', 'Bhtan', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Chile', 'Colombia', 'Congo (Kinshasa)', 'Costa Rica', 'Cote dIvoire', 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Dominican Republic', 'Egypt', 'Estonia', 'Georgia', 'Greece', 'Honduras', 'Iceland', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan', 'Kuwait', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg', 'Malaysia', 'Maldives', 'Malta', 'Mauritania', 'Mauritius', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Namibia', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Panama','Paraguay', 'Rawanda', 'Saint Lucia', 'San Marino', 'Senegal', 'Seychelles', 'Singapore','Slovakia', 'Slovenia', 'Sri Lanka', 'Thailand', 'Tunisia', 'Uganda', 'Uruguay', 'Venezuela'] def get_bounds_fatal (country, isState, y_train): x = '' for c in emerging_countries: if country == c: x = c; break maximum = max(y_train) if maximum == 0.0: maximum = 1.0 if country == 'China': lower = [0, 0.02, 0] upper = [2.0*maximum,0.16, 40] elif country == 'Iran': lower = [0, 0.00, 0] upper = [3.0*maximum,0.11, 68] 
elif country == 'Italy': lower = [0, 0.00, 0] upper = [3.0*maximum,0.13, 72] elif country == 'US': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 85] else: upper = [3.5*maximum, 0.20, 90] elif country == 'France': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum,0.18, 80] else: upper = [4.0*maximum,0.15, 90] elif country == 'Spain': lower = [0, 0.02, 0] upper = [3.0*maximum,0.15, 78] elif country == 'Germany': lower = [0.0, 0.02, 0] upper = [3.0*maximum,0.20, 85] elif country == 'Belgium': lower = [0.0, 0.02, 0] upper = [3.0*maximum,0.25, 88] elif country == 'Turkey': lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.22, 90] elif country == 'Netherlands': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.14, 88] elif country == 'Switzerland': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.12, 90] elif country == 'United Kingdom': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.16, 95] elif country == 'Portugal': lower = [100, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Sweden': lower = [100, 0.02, 0] upper = [4.0*maximum,0.18, 90] elif country == 'Brazil': lower = [100, 0.02, 0] upper = [3.5*maximum,0.20, 90] elif country == 'Indonesia': lower = [100, 0.02, 0] upper = [4.5*maximum,0.10, 95] elif country == 'Austria': lower = [0, 0.02, 0] upper = [4.5*maximum,0.10, 95] elif country == 'Ireland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Canada': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.20, 65] else: upper = [4.5*maximum, 0.16, 95] elif country == 'India': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Ecuador': lower = [0, 0.02, 0] upper = [4.5*maximum,0.16, 96] elif country == 'Romania': lower = [0, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Philippines': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Algeria': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Mexico': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Denmark': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 80] else: upper = [4.5*maximum,0.12, 94] elif country == 'Poland': lower = [0, 0.02, 0] upper = [4.0*maximum,0.20, 94] elif country == 'Korea, South': lower = [0, 0.02, 0] upper = [2.5*maximum,0.10, 52] elif country == 'Peru': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.18, 95] elif country == 'Australia': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.20, 45] else: upper = [2.5*maximum,0.20, 70] elif country == 'Pakistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,95] elif country == 'Saudi Arabia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.15,95] elif country == 'Afghanistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,95] elif country == 'Diamond Princess': lower = [0.0, 0.02, 0] upper = [1.0*maximum,0.50,2] elif country == 'Hungary': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.14,94] elif country == 'New Zealand': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.14,90] elif country == 'Somalia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == x: lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.15,85] else: lower = [0.0, 0.02, 0] if isState: if maximum <=10:upper = [4.0*maximum,0.30,80] else: upper = [4.5*maximum,0.15,80] else: if maximum <=10:upper = [4.0*maximum,0.60,85] else: upper = [4.5*maximum,0.18,95] return lower, upper def get_bounds_confirm (country, isState, y_train): x = '' for c in emerging_countries: if country == c: x = c; break maximum = max(y_train) if maximum == 
0.0: maximum = 1.0 if country == 'China': lower = [0, 0.02, 0] upper = [2.0*maximum,0.20,30] elif country == 'Iran': lower = [0, 0.00, 0] upper = [3.0*maximum,0.12,70] elif country == 'Italy': lower = [0, 0.00, 0] upper = [3.0*maximum,0.12, 70] elif country == 'US': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 80] else: upper = [3.0*maximum, 0.18, 85] elif country == 'France': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.15, 80] else: upper = [4.5*maximum, 0.10, 90] elif country == 'Spain': lower = [0, 0.02, 0] upper = [3.0*maximum,0.13, 75] elif country == 'Germany': lower = [0, 0.02, 0] upper = [3.0*maximum,0.13, 75] elif country == 'Belgium': lower = [0, 0.02, 0] upper = [3.0*maximum,0.15, 78] elif country == 'Turkey': lower = [0, 0.02, 0] upper = [3.5*maximum,0.20, 90] elif country == 'Netherlands': lower = [0, 0.02, 0] upper = [4.0*maximum,0.10, 88] elif country == 'Switzerland': lower = [0, 0.02, 0] upper = [3.5*maximum,0.10, 75] elif country == 'United Kingdom': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Portugal': lower = [0, 0.02, 0] upper = [4.0*maximum,0.11, 88] elif country == 'Sweden': lower = [0, 0.02, 0] upper = [4.0*maximum,0.10, 88] elif country == 'Brazil': lower = [0, 0.02, 0] upper = [3.5*maximum,0.18, 88] elif country == 'Indonesia': lower = [0, 0.02, 0] upper = [5.5*maximum,0.09, 100] elif country == 'Austria': lower = [0, 0.02, 0] upper = [3.5*maximum,0.12, 75] elif country == 'Ireland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Canada': lower = [0, 0.02, 0] if maximum <=10: upper = [3.0*maximum, 0.28, 75] else: upper = [4.5*maximum, 0.12, 93] elif country == 'India': lower = [0, 0.02, 0] upper = [4.5*maximum,0.16, 96] elif country == 'Ecuador': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Romania': lower = [0, 0.02, 0] upper = [4.5*maximum,0.11, 93] elif country == 'Philippines': lower = [0, 0.02, 0] upper = [5.5*maximum,0.12, 95] elif country == 'Algeria': lower = [0, 0.02, 0] upper = [5.5*maximum,0.10, 98] elif country == 'Mexico': lower = [100, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Denmark': lower = [0, 0.02, 0] if isState: if maximum <= 10: upper = [2.0*maximum,0.20,80] else: upper = [2.5*maximum,0.25, 55] else: if maximum <=10: upper = [2.0*maximum,0.30, 40] else: upper = [5.5*maximum,0.06, 100] elif country == 'Poland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.11, 94] elif country == 'Korea, South': lower = [0, 0.02, 0] upper = [2.0*maximum,0.25, 18] elif country == 'Peru': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 96] elif country == 'Australia': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.25, 45] else: upper = [2.5*maximum,0.18, 65] elif country == 'Pakistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'Saudi Arabia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'Afghanistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,94] elif country == 'Diamond Princess': lower = [0.0, 0.02, 0] upper = [1.0*maximum,1.0,1.0] elif country == 'Hungary': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'New Zealand': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.15,85] elif country == 'Somalia': lower = [0.0, 0.02, 0] upper = [1.0*maximum,0.08,50] elif country == x: lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.10,80] else: lower = [0.0, 0.02, 0] if isState: if maximum <= 200: upper = [2.0*maximum,0.20,80] else: upper = [4.5*maximum,0.20,80] 
else: if maximum <= 200: upper = [3.0*maximum,0.20,85] else: upper = [4.5*maximum,0.20,96] return lower, upper plt.rcParams.update({'font.size': 12}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20, 8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(20, 8)) rainbow_col= plt.cm.jet(np.linspace(0,1,len(countries))) for country, c in tqdm(zip(countries,rainbow_col)): #print('\ \ \ \ country ==>', country) df_country_train = df_max_fatality_country[df_max_fatality_country['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] if country != 'China': df_country_train = df_country_train.reset_index().loc[df_country_train.reset_index().Date>'2020-02-22'] #17 n_days_sans_China =train_data.Date.nunique() - df_country_train.Date.nunique() else: df_country_train = df_country_train.reset_index() n_days_sans_China = 0 n_train_days =df_country_train.Date.nunique() x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days)#n_test_days+overlap_days) y_train_f = df_country_train['Fatalities'] y_train_c = df_country_train['ConfirmedCases'] y_train_cn = (df_country_train['ConfirmedCases'] - df_country_train['ConfirmedCases'].shift(1)).fillna(0.0).replace([-np.inf, np.inf], 0.0) y_train_fn = (df_country_train['Fatalities'] - df_country_train['Fatalities'].shift(1)).fillna(0.0).replace([-np.inf, np.inf], 0.0) ###### Fatalities: lower, upper = get_bounds_fatal (country, 0, y_train_f) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) y_predict_f_at_t0 = Gompertz(a_max, estimated_c, estimated_t0, estimated_t0) #print('\ fatalities ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% r2_score(y_train_f[:], y_predict_f[0:n_train_days])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) ###### Confirmed cases: lower_c,upper_c = get_bounds_confirm (country, 0, y_train_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) y_predict_c_at_t0 = Gompertz(a_max_c, estimated_c_c, estimated_t0_c, estimated_t0_c) #print('confirmed ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:n_train_days])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) ## ===== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_cn = np.pad(y_predict_cn,(n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') y_predict_fn = np.pad(y_predict_fn, (n_days_sans_China, 0), 'constant') inflection_c = estimated_t0_c+n_days_sans_China extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) y_train_c = np.pad(y_train_c, (n_days_sans_China, 0), 'constant') y_train_cn = np.pad(y_train_cn, (n_days_sans_China, 
0), 'constant') y_train_f = np.pad(y_train_f, (n_days_sans_China, 0), 'constant') y_train_fn = np.pad(y_train_fn, (n_days_sans_China, 0), 'constant') inflection_f = estimated_t0+n_days_sans_China ## ===== Plot ======= ax0.plot(x_test, y_predict_c, linewidth=2, label=country, color=c) ax0.plot(inflection_c, y_predict_c_at_t0, marker='o', markersize=6, color='green')#, label='inflection') ax0.plot(x_train, y_train_c, linewidth=2, color=c,linestyle='dotted')#, label='train_'+country) ax0.set_title(""Total Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") ax0.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax1.plot(x_test, y_predict_f, linewidth=2, label=country,color=c) ax1.plot(inflection_f, y_predict_f_at_t0, marker='o', markersize=6, color='green') ax1.plot(x_train, y_train_f, linewidth=2,color=c, linestyle='dotted')#, label='train_'+country) ax1.set_title(""Total Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax2.plot(x_test, y_predict_cn, linewidth=2, label=country, color=c) ax2.scatter(x_train, y_train_cn, linewidth=2, color=c, linestyle='dotted')#, label='train_'+country) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"") ax2.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.plot(x_test, y_predict_fn, linewidth=2, label=country, color=c) ax3.scatter(x_train, y_train_fn, linewidth=2, color=c, linestyle='dotted')#, label='train_'+country) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") ax3.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5))'",Yes,3,33.0 "nCountries= train_data['Country_Region'].unique() isState = bool x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days) for country in tqdm(nCountries): fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20,8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(20,8)) #print('\ \ \ \ country ==>', country) df_country_train = train_data[train_data['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] if country != 'China': df_country_train = df_country_train.reset_index().loc[df_country_train.reset_index().Date>'2020-02-22'] #17 n_days_sans_China =train_data.Date.nunique() - df_country_train.Date.nunique() else: df_country_train = df_country_train.reset_index() n_days_sans_China = 0 n_train_days =df_country_train.Date.nunique() x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days) nvalues = df_country_train['Province_State'].isna().nunique() #fix for problem with Denmark data if (df_country_train['Province_State'].isna().unique()==True).any() and nvalues<2: isState = False y_train_f = df_country_train['Fatalities'] y_train_c = df_country_train['ConfirmedCases'] y_train_cn = (df_country_train['ConfirmedCases'] - df_country_train['ConfirmedCases'].shift(1)).fillna(0.0) y_train_fn = (df_country_train['Fatalities'] - df_country_train['Fatalities'].shift(1)).fillna(0.0) if y_train_f.empty == False: lower, upper = get_bounds_fatal (country, isState, y_train_f) #print(lower, upper) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) #print('\ fatalities ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% 
r2_score(y_train_f[:], y_predict_f[0:n_train_days])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) if y_train_c.empty == False: lower_c, upper_c = get_bounds_confirm (country, isState, y_train_c) #print(lower_c, upper_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) #print('\ confirmed ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:n_train_days])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) ## ===== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_cn = np.pad(y_predict_cn,(n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') inflection_f = estimated_t0+n_days_sans_China y_predict_fn = np.pad(y_predict_fn, (n_days_sans_China, 0), 'constant') extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) y_train_c = np.pad(y_train_c, (n_days_sans_China, 0), 'constant') y_train_cn = np.pad(y_train_cn, (n_days_sans_China, 0), 'constant') y_train_f = np.pad(y_train_f, (n_days_sans_China, 0), 'constant') y_train_fn = np.pad(y_train_fn, (n_days_sans_China, 0), 'constant') inflection_c = estimated_t0_c+n_days_sans_China ax0.plot(x_test, y_predict_c, linewidth=2, label='predict_'+country) ax0.plot(x_train, y_train_c, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax0.set_title(""Prediction vs Training for Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") ax0.legend() test_data.loc[test_data['Country_Region']==country,'ConfirmedCases'] = y_predict_c[-n_test_days:] ax1.plot(x_test, y_predict_f, linewidth=2, label='predict_'+country) ax1.plot(x_train, y_train_f, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend() test_data.loc[test_data['Country_Region']==country,'Fatalities'] = y_predict_f[-n_test_days:] ax2.plot(x_test, y_predict_cn, linewidth=2, label='predict_'+country) ax2.scatter(x_train, y_train_cn, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"") ax2.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.plot(x_test, y_predict_fn, linewidth=2, label='predict_'+country) ax3.scatter(x_train, y_train_fn, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") ax3.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) else: # use Province/State data when available isState = True state_list = [] y_predict_c_dict = {}; y_train_c_dict = {} y_predict_cn_dict = {}; y_train_cn_dict = {} y_predict_f_dict = {}; y_train_f_dict = {} 
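# (added note) for countries reported at Province/State level, the loop below fits a separate
# Gompertz curve per state and writes the per-state predictions back into test_data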
y_predict_fn_dict = {}; y_train_fn_dict = {} for state in df_country_train['Province_State'].unique(): df_state_train = df_country_train[df_country_train['Province_State']==state] #state df_state_test = df_country_test[df_country_test['Province_State']==state] state_list.append(state) y_train_f = df_state_train['Fatalities'] y_train_c = df_state_train['ConfirmedCases'] y_train_cn = (df_state_train['ConfirmedCases'] - df_state_train['ConfirmedCases'].shift(1)).fillna(0.0) y_train_fn = (df_state_train['Fatalities'] - df_state_train['Fatalities'].shift(1)).fillna(0.0) if y_train_f.empty== False: lower, upper = get_bounds_fatal (country, isState, y_train_f) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) y_predict_f_dict[state] = y_predict_f y_train_f_dict[state] = y_train_f #print('\ fatalities state ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% r2_score(y_train_f[:], y_predict_f[0:70])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) y_predict_fn_dict[state] = y_predict_fn y_train_fn_dict[state] = y_train_fn if y_train_c.empty == False: lower_c, upper_c = get_bounds_confirm (country, isState, y_train_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) y_predict_c_dict[state] = y_predict_c y_train_c_dict[state] = y_train_c #print('\ confirmed state ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:70])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) y_predict_cn_dict[state] = y_predict_cn y_train_cn_dict[state] = y_train_cn ## ====== Plot and Store the Results: ====== ## ====== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) for state, y_predict in y_predict_f_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax1.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) ax1.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'Fatalities'] = y_predict[-n_test_days:] for state, y_train in y_train_f_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax1.plot(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+state) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") for state, y_predict in y_predict_c_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax0.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) #ax0.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'ConfirmedCases'] = 
y_predict[-n_test_days:] for state, y_train in y_train_c_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax0.plot(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+country+'_'+state) ax0.set_title(""Prediction vs Training for ConfirmedCases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") for state, y_predict in y_predict_fn_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax3.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) ax3.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) for state, y_train in y_train_fn_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax3.scatter(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+state) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") for state, y_predict in y_predict_cn_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax2.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) #ax2.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'ConfirmedCases'] = y_predict[-n_test_days:] for state, y_train in y_train_cn_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax2.scatter(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+country+'_'+state) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"")'",Yes,3,56.0 "submit_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/submission.csv"")#, index_col=0) test_data['Fatalities'] = test_data['Fatalities'].fillna(0.0).astype(int) test_data['ConfirmedCases'] = test_data['ConfirmedCases'].fillna(0.0).astype(int) submit_data['Fatalities'] = test_data['Fatalities'].astype('int') submit_data['ConfirmedCases'] = test_data['ConfirmedCases'].astype('int') submit_data.to_csv('submission.csv', index=False) submit_data.head()'",Yes,4,25.0 display(submit_data.describe()),No,5,40.0 "import numpy as np import pandas as pd from sklearn.model_selection import train_test_split df = pd.read_csv('../input/44352/training_solutions_rev1.csv') df_train, df_test = train_test_split(df, test_size=.2) df_train.shape, df_test.shape",No,3,45.0 "from skimage.transform import resize from tqdm import tqdm import matplotlib.pyplot as plt %matplotlib inline ORIG_SHAPE = (424,424) CROP_SIZE = (256,256) IMG_SHAPE = (64,64) def get_image(path, x1,y1, shape, crop_size): x = plt.imread(path) x = x[x1:x1+crop_size[0], y1:y1+crop_size[1]] x = resize(x, shape) x = x/255. 
return x def get_all_images(dataframe, shape=IMG_SHAPE, crop_size=CROP_SIZE): x1 = (ORIG_SHAPE[0]-CROP_SIZE[0])//2 y1 = (ORIG_SHAPE[1]-CROP_SIZE[1])//2 sel = dataframe.values ids = sel[:,0].astype(int).astype(str) y_batch = sel[:,1:] x_batch = [] for i in tqdm(ids): x = get_image('../input/44352/images_training_rev1/'+i+'.jpg', x1,y1, shape=shape, crop_size=crop_size) x_batch.append(x) x_batch = np.array(x_batch) return x_batch, y_batch X_train, y_train = get_all_images(df_train) X_test, y_test = get_all_images(df_test)",Yes,3,44.0 "from keras.models import Sequential from keras.layers import Conv2D, MaxPooling2D from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, GlobalMaxPooling2D from keras import backend as K def root_mean_squared_error(y_true, y_pred): return K.sqrt(K.mean(K.square(y_pred - y_true))) model = Sequential() model.add(Conv2D(512, (3, 3), input_shape=(IMG_SHAPE[0], IMG_SHAPE[1], 3))) model.add(Conv2D(256, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Conv2D(256, (3, 3))) model.add(Conv2D(128, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Conv2D(128, (3, 3))) model.add(Conv2D(128, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(GlobalMaxPooling2D()) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(37)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adamax', metrics=[root_mean_squared_error]) model.summary()",Yes,5,4.0 "batch_size = 128 model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test))",No,5,7.0 "import os from tqdm import tqdm def test_image_generator(ids, shape=IMG_SHAPE): x1 = (ORIG_SHAPE[0]-CROP_SIZE[0])//2 y1 = (ORIG_SHAPE[1]-CROP_SIZE[1])//2 x_batch = [] for i in ids: x = get_image('../input/44352/images_test_rev1/'+i, x1, y1, shape=IMG_SHAPE, crop_size=CROP_SIZE) x_batch.append(x) x_batch = np.array(x_batch) return x_batch val_files = os.listdir('../input/44352/images_test_rev1/') val_predictions = [] N_val = len(val_files) for i in tqdm(np.arange(0, N_val, batch_size)): if i+batch_size > N_val: upper = N_val else: upper = i+batch_size X = test_image_generator(val_files[i:upper]) y_pred = model.predict(X) val_predictions.append(y_pred) val_predictions = np.array(val_predictions) Y_pred = np.vstack(val_predictions) ids = np.array([v.split('.')[0] for v in val_files]).reshape(len(val_files),1) submission_df = pd.DataFrame(np.hstack((ids, Y_pred)), columns=df.columns) submission_df = submission_df.sort_values(by=['GalaxyID']) submission_df.to_csv('sample_submission.csv', index=False)",Yes,2,48.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os from catboost.datasets import amazon train, test = amazon() print(train.shape, test.shape) target = ""ACTION"" col4train = [x for x in train.columns if x not in [target, ""ROLE_TITLE""]] y = train[target].values",No,3,58.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import cross_validate # returns model instance def get_model(): params = { ""n_estimators"":300, ""n_jobs"": 3, ""random_state"":5436, } return ExtraTreesClassifier(**params) # validate model on given dataset and report CV score def validate_model(model, data): skf = StratifiedKFold(n_splits=5, random_state = 4141, shuffle = True) stats = cross_validate( model, data[0], data[1], groups=None, scoring='roc_auc', cv=skf, n_jobs=None, return_train_score = True ) stats = pd.DataFrame(stats) return stats.describe().transpose() # transforms given train and test datasets using provided function, # function parameters can be passed as a dict def transform_dataset(train, test, func, func_params = {}): dataset = pd.concat([train, test], ignore_index = True) dataset = func(dataset, **func_params) if isinstance(dataset, pd.DataFrame): new_train = dataset.iloc[:train.shape[0],:].reset_index(drop = True) new_test = dataset.iloc[train.shape[0]:,:].reset_index(drop = True) else: new_train = dataset[:train.shape[0]] new_test = dataset[train.shape[0]:] return new_train, new_test'",Yes,3,28.0 "MJTCP = 32292 #Michael Jordan total career points #for each column in dataset creates N column with random integers def assign_rnd_integer(dataset, number_of_times = 5, seed = MJTCP): new_dataset = pd.DataFrame() np.random.seed(seed) for c in dataset.columns: for i in range(number_of_times): col_name = c+""_""+str(i) unique_vals = dataset[c].unique() labels = np.array(list(range(len(unique_vals)))) np.random.shuffle(labels) mapping = pd.DataFrame({c: unique_vals, col_name: labels}) new_dataset[col_name] = (dataset[[c]] .merge(mapping, on = c, how = 'left')[col_name] ).values return new_dataset'",Yes,3,12.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":5} ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "validate_model( model = get_model(), data = [new_train.values, y] )",No,3,28.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":1} ) print(new_train.shape, new_test.shape) validate_model( model = get_model(), data = [new_train.values, y] )",Yes,3,28.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":10} ) print(new_train.shape, new_test.shape) validate_model( model = get_model(), data = [new_train.values, y] )",Yes,4,28.0 "from sklearn.preprocessing import OneHotEncoder # transforms given dataset to OHE representation def one_hot(dataset): ohe = OneHotEncoder(sparse=True, dtype=np.float32, handle_unknown='ignore') return ohe.fit_transform(dataset.values)",Yes,5,20.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], one_hot) print(new_train.shape, new_test.shape)",No,3,12.0 "from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer def extract_col_interaction(dataset, col1, col2, tfidf = True): data = dataset.groupby([col1])[col2].agg(lambda x: "" "".join(list([str(y) for y in x]))) if tfidf: vectorizer = 
TfidfVectorizer(tokenizer=lambda x: x.split("" "")) else: vectorizer = CountVectorizer(tokenizer=lambda x: x.split("" "")) data_X = vectorizer.fit_transform(data) dim_red = TruncatedSVD(n_components=1, random_state = 5115) data_X = dim_red.fit_transform(data_X) result = pd.DataFrame() result[col1] = data.index.values result[col1+""_{}_svd"".format(col2)] = data_X.ravel() return result import itertools def get_col_interactions_svd(dataset, tfidf = True): new_dataset = pd.DataFrame() for col1,col2 in itertools.permutations(dataset.columns, 2): data = extract_col_interaction(dataset, col1,col2, tfidf) col_name = [x for x in data.columns if ""svd"" in x][0] new_dataset[col_name] = dataset[[col1]].merge(data, on = col1, how = 'left')[col_name] return new_dataset'",Yes,3,8.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], get_col_interactions_svd ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "def get_freq_encoding(dataset): new_dataset = pd.DataFrame() for c in dataset.columns: data = dataset.groupby([c]).size().reset_index() new_dataset[c+""_freq""] = dataset[[c]].merge(data, on = c, how = ""left"")[0] return new_dataset",No,3,12.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], get_freq_encoding ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "new_train1, new_test1 = transform_dataset( train[col4train], test[col4train], get_freq_encoding ) new_train2, new_test2 = transform_dataset( train[col4train], test[col4train], get_col_interactions_svd ) new_train3, new_test3 = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":10} ) new_train = pd.concat([new_train1, new_train2, new_train3], axis = 1) new_test = pd.concat([new_test1, new_test2, new_test3], axis = 1) print(new_train.shape, new_test.shape)",Yes,4,11.0 "model = get_model() model.fit(new_train.values, y) predictions = model.predict_proba(new_test)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,4,25.0 "# Assuring you have the most recent CatBoost release !pip install catboost -U",No,5,87.0 "# Getting useful tabular processing and generator functions !git clone https://github.com/lmassaron/deep_learning_for_tabular_data.git",No,2,23.0 "# Importing core libraries import numpy as np import pandas as pd from time import time import pprint import joblib # Suppressing warnings because of skopt verbosity import warnings warnings.filterwarnings(""ignore"") # Classifiers from catboost import CatBoostClassifier, Pool # Model selection from sklearn.model_selection import StratifiedKFold # Metrics from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.metrics import make_scorer",No,5,23.0 "# Loading data directly from CatBoost from catboost.datasets import amazon X, Xt = amazon() y = X[""ACTION""].apply(lambda x: 1 if x == 1 else 0).values X.drop([""ACTION""], axis=1, inplace=True)",Yes,4,21.0 "# Transforming all the labels of all variables from sklearn.preprocessing import LabelEncoder label_encoders = [LabelEncoder() for _ in range(X.shape[1])] for col, column in enumerate(X.columns): label_encoders[col].fit(X[column].append(Xt[column])) X[column] = label_encoders[col].transform(X[column]) Xt[column] = label_encoders[col].transform(Xt[column])",Yes,5,20.0 "# Enconding frequencies instead of labels (so we have some numeric variables) def frequency_encoding(column, df, df_test=None): 
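# (added note) frequency encoding: each categorical value is replaced by how often it appears
# in the training frame; unseen test values fall back to a count of 1 via the fillna(1) below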
frequencies = df[column].value_counts().reset_index() df_values = df[[column]].merge(frequencies, how='left', left_on=column, right_on='index').iloc[:,-1].values if df_test is not None: df_test_values = df_test[[column]].merge(frequencies, how='left', left_on=column, right_on='index').fillna(1).iloc[:,-1].values else: df_test_values = None return df_values, df_test_values for column in X.columns: train_values, test_values = frequency_encoding(column, X, Xt) X[column+'_counts'] = train_values Xt[column+'_counts'] = test_values",No,5,20.0 "# Pointing out which variables are categorical and which are numeric categorical_variables = [col for col in X.columns if '_counts' not in col] numeric_variables = [col for col in X.columns if '_counts' in col]",No,3,77.0 X.head(),No,5,41.0 Xt.head(),No,5,41.0 "# Counting unique values of categorical variables X[categorical_variables].nunique()",No,5,54.0 "# Describing numeric variables X[numeric_variables].describe()",No,5,40.0 "# Initializing a CatBoostClassifier with best parameters best_params = {'bagging_temperature': 0.6, 'border_count': 200, 'depth': 8, 'iterations': 350, 'l2_leaf_reg': 30, 'learning_rate': 0.30, 'random_strength': 0.01, 'scale_pos_weight': 0.48} catb = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric = 'AUC', nan_mode='Min', thread_count=2, verbose = False)",No,4,4.0 "# CV interations roc_auc = list() average_precision = list() oof = np.zeros(len(X)) best_iteration = list() for train_idx, test_idx in skf.split(X, y): X_train, y_train = X.iloc[train_idx, :], y[train_idx] X_test, y_test = X.iloc[test_idx, :], y[test_idx] train = Pool(data=X_train, label=y_train, feature_names=list(X_train.columns), cat_features=categorical_variables) test = Pool(data=X_test, label=y_test, feature_names=list(X_test.columns), cat_features=categorical_variables) catb.fit(train, verbose_eval=100, early_stopping_rounds=50, eval_set=test, use_best_model=True, #task_type = ""GPU"", plot=False) best_iteration.append(catb.best_iteration_) preds = catb.predict_proba(X_test) oof[test_idx] = preds[:,1] roc_auc.append(roc_auc_score(y_true=y_test, y_score=preds[:,1])) average_precision.append(average_precision_score(y_true=y_test, y_score=preds[:,1]))",Yes,2,7.0 "# Using catboost on all the data for predictions best_params = {'bagging_temperature': 0.6, 'border_count': 200, 'depth': 8, 'iterations': int(np.median(best_iteration) * 1.3), 'l2_leaf_reg': 30, 'learning_rate': 0.30, 'random_strength': 0.01, 'scale_pos_weight': 0.48} catb = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric = 'AUC', nan_mode='Min', thread_count=2, verbose = False) train = Pool(data=X, label=y, feature_names=list(X_train.columns), cat_features=categorical_variables) catb.fit(train, verbose_eval=100, #task_type = ""GPU"", plot=False) submission = pd.DataFrame(Xt.id) Xt_pool = Pool(data=Xt[list(X_train.columns)], feature_names=list(X_train.columns), cat_features=categorical_variables) submission['Action'] = catb.predict_proba(Xt_pool)[:,1] submission.to_csv(""catboost_submission.csv"", index=False) cat_boost_submission = submission.copy()'",Yes,3,7.0 "import tensorflow as tf from tensorflow.keras import backend as K from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense from tensorflow.keras.optimizers import Adam, Nadam from tensorflow.keras.layers import Input, Embedding, Reshape, GlobalAveragePooling1D from tensorflow.keras.layers import Flatten, concatenate, Concatenate, Lambda, Dropout, SpatialDropout1D from 
tensorflow.keras.layers import Reshape, MaxPooling1D,BatchNormalization, AveragePooling1D, Conv1D from tensorflow.keras.layers import Activation, LeakyReLU from tensorflow.keras.optimizers import SGD, Adam, Nadam from tensorflow.keras.models import Model, load_model from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau from tensorflow.keras.regularizers import l2, l1_l2 from keras.losses import binary_crossentropy from sklearn.metrics import roc_auc_score from sklearn.metrics import average_precision_score import matplotlib.pyplot as plt",No,5,22.0 "# Registering custom activations suitable for tabular problems from tensorflow.keras.utils import get_custom_objects from tensorflow.keras.layers import Activation, LeakyReLU from deep_learning_for_tabular_data.tabular import gelu, Mish, mish # Add gelu so we can use it as a string get_custom_objects().update({'gelu': Activation(gelu)}) # Add mish so we can use it as a string get_custom_objects().update({'mish': Mish(mish)}) # Add leaky-relu so we can use it as a string get_custom_objects().update({'leaky-relu': Activation(LeakyReLU(alpha=0.2))})",Yes,5,22.0 "# Parametric architecture def tabular_dnn(numeric_variables, categorical_variables, categorical_counts, feature_selection_dropout=0.2, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.2, activation_type=gelu): numerical_inputs = Input(shape=(len(numeric_variables),)) numerical_normalization = BatchNormalization()(numerical_inputs) numerical_feature_selection = Dropout(feature_selection_dropout)(numerical_normalization) categorical_inputs = [] categorical_embeddings = [] for category in categorical_variables: categorical_inputs.append(Input(shape=[1], name=category)) category_counts = categorical_counts[category] categorical_embeddings.append( Embedding(category_counts+1, int(np.log1p(category_counts)+1), name = category + ""_embed"")(categorical_inputs[-1])) categorical_logits = Concatenate(name = ""categorical_conc"")([Flatten()(SpatialDropout1D(categorical_dropout)(cat_emb)) for cat_emb in categorical_embeddings]) x = concatenate([numerical_feature_selection, categorical_logits]) x = Dense(first_dense, activation=activation_type)(x) x = Dropout(dense_dropout)(x) x = Dense(second_dense, activation=activation_type)(x) x = Dropout(dense_dropout)(x) output = Dense(1, activation=""sigmoid"")(x) model = Model([numerical_inputs] + categorical_inputs, output) return model",No,4,4.0 "# Useful functions from tensorflow.keras.metrics import AUC def mAP(y_true, y_pred): return tf.py_func(average_precision_score, (y_true, y_pred), tf.double) def compile_model(model, loss, metrics, optimizer): model.compile(loss=loss, metrics=metrics, optimizer=optimizer) return model def plot_keras_history(history, measures): """""" history: Keras training history measures = list of names of measures """""" rows = len(measures) // 2 + len(measures) % 2 fig, panels = plt.subplots(rows, 2, figsize=(15, 5)) plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=0.4, wspace=0.2) try: panels = [item for sublist in panels for item in sublist] except: pass for k, measure in enumerate(measures): panel = panels[k] panel.set_title(measure + ' history') panel.plot(history.epoch, history.history[measure], label=""Train ""+measure) panel.plot(history.epoch, history.history[""val_""+measure], label=""Validation ""+measure) panel.set(xlabel='epochs', ylabel=measure) panel.legend() plt.show(fig)'",Yes,5,35.0 "# Global training settings SEED = 42 FOLDS = 5 BATCH_SIZE = 
512",No,5,77.0 "from deep_learning_for_tabular_data.tabular import TabularTransformer, DataGenerator # Setting the CV strategy skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED) # CV Iteration roc_auc = list() average_precision = list() oof = np.zeros(len(X)) best_iteration = list() for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): tb = TabularTransformer(numeric = numeric_variables, ordinal = [], lowcat = [], highcat = categorical_variables) tb.fit(X.iloc[train_idx]) sizes = tb.shape(X.iloc[train_idx]) categorical_levels = dict(zip(categorical_variables, sizes[1:])) print(f""Input array sizes: {sizes}"") print(f""Categorical levels: {categorical_levels}\ "") model = tabular_dnn(numeric_variables, categorical_variables, categorical_levels, feature_selection_dropout=0.1, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.1, activation_type=gelu) model = compile_model(model, binary_crossentropy, [AUC(name='auc'), mAP], Adam(learning_rate=0.0001)) train_batch = DataGenerator(X.iloc[train_idx], y[train_idx], tabular_transformer=tb, batch_size=BATCH_SIZE, shuffle=True) history = model.fit_generator(train_batch, validation_data=(tb.transform(X.iloc[test_idx]), y[test_idx]), epochs=30, callbacks=[model_checkpoint, early_stopping, reduce_learning], class_weight=[1.0, (np.sum(y==0) / np.sum(y==1))], verbose=1) print(""\ FOLD %i"" % fold) plot_keras_history(history, measures=['auc', 'loss']) best_iteration.append(np.argmax(history.history['val_auc']) + 1) preds = model.predict(tb.transform(X.iloc[test_idx]), verbose=1, batch_size=1024).flatten() oof[test_idx] = preds roc_auc.append(roc_auc_score(y_true=y[test_idx], y_score=preds)) average_precision.append(average_precision_score(y_true=y[test_idx], y_score=preds))'",Yes,2,48.0 "# We train on all the examples, using a rule of thumb for the number of iterations tb = TabularTransformer(numeric = numeric_variables, ordinal = [], lowcat = [], highcat = categorical_variables) tb.fit(X) sizes = tb.shape(X) categorical_levels = dict(zip(categorical_variables, sizes[1:])) print(f""Input array sizes: {sizes}"") print(f""Categorical levels: {categorical_levels}\ "") model = tabular_dnn(numeric_variables, categorical_variables, categorical_levels, feature_selection_dropout=0.1, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.1, activation_type=gelu) model = compile_model(model, binary_crossentropy, [AUC(name='auc'), mAP], Adam(learning_rate=0.0001)) train_batch = DataGenerator(X, y, tabular_transformer=tb, batch_size=BATCH_SIZE, shuffle=True) history = model.fit_generator(train_batch, epochs=int(np.median(best_iteration)), class_weight=[1.0, (np.sum(y==0) / np.sum(y==1))], verbose=1)'",Yes,2,8.0 "# Predicting and submission preds = model.predict(tb.transform(Xt[X.columns]), verbose=1, batch_size=1024).flatten() submission = pd.DataFrame(Xt.id) submission['Action'] = preds submission.to_csv(""tabular_dnn_submission.csv"", index=False) tabular_dnn_submission = submission.copy()'",Yes,5,25.0 "from scipy.stats import rankdata # We use normalized ranks because probabilities emissions from the two models may differ dnn_rank = rankdata(tabular_dnn_submission.Action, method='dense') / len(Xt) cat_rank = rankdata(cat_boost_submission.Action, method='dense') / len(Xt) submission = pd.DataFrame(Xt.id) submission['Action'] = 0.5 * dnn_rank + 0.5 * cat_rank submission.to_csv(""blended_submission.csv"", index=False)'",Yes,5,25.0 "import matplotlib.pyplot as plt import seaborn as 
sns import numpy as np import pandas as pd import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv('../input/amazon-employee-access-challenge/train.csv') print(data.shape) data.head()",Yes,4,45.0 data_explore = data.copy(),No,4,12.0 data_explore.info(),No,5,40.0 "sns.countplot(x='ACTION', data=data_explore)",No,5,33.0 "data_explore_role_dept = data_explore[['ROLE_DEPTNAME', ""ACTION""]].groupby(by='ROLE_DEPTNAME').count() data_explore_role_dept.sort_values('ACTION', ascending=False).head(n=15).transpose()'",No,2,40.0 "data_explore_role_codes = data_explore[['ROLE_CODE', ""ACTION""]].groupby(by='ROLE_CODE').count() data_explore_role_codes.sort_values('ACTION', ascending=False).head(n=15).transpose()'",Yes,3,12.0 "data_explore_role_family = data_explore[['ROLE_FAMILY', ""ACTION""]].groupby(by='ROLE_FAMILY').count() data_explore_role_family.sort_values('ACTION', ascending=False).head(n=15).transpose()'",Yes,3,12.0 "plt.figure(figsize=(12, 7)) corr_matrix = data_explore.corr() sns.heatmap(corr_matrix, mask=np.zeros_like(corr_matrix, dtype=np.bool), square=True, annot=True, cbar=False) plt.tight_layout()",No,5,80.0 corr_matrix['ACTION'].sort_values(ascending=False),No,2,40.0 "from sklearn.impute import SimpleImputer from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer",No,5,22.0 "X = data.drop(columns=['ACTION'], axis=1).copy() y = data['ACTION'].copy() X.shape, y.shape",No,3,21.0 "from sklearn.model_selection import StratifiedShuffleSplit split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) for train_index, test_index in split.split(X, y): strat_train_set = data.iloc[train_index] strat_test_set = data.iloc[test_index] X_train = strat_train_set.drop('ACTION', axis=1) y_train = strat_train_set['ACTION'].copy() X_test = strat_test_set.drop('ACTION', axis=1) y_test = strat_test_set['ACTION'].copy() X_train.shape, X_test.shape",Yes,4,13.0 "from sklearn.model_selection import KFold, cross_val_score kf = KFold(n_splits=5, shuffle=True, random_state=42)",Yes,5,84.0 "from sklearn.metrics import matthews_corrcoef, make_scorer, roc_auc_score, roc_curve Matthew = make_scorer(matthews_corrcoef) results = [] def plot_custom_roc_curve(clf_name, y_true, y_scores): auc_score = np.round(roc_auc_score(y_true, y_scores), 3) fpr, tpr, thresholds = roc_curve(y_true, y_scores) plt.plot(fpr, tpr, linewidth=2, label=clf_name+"" (AUC Score: {})"".format(str(auc_score))) plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal plt.axis([0, 1, 0, 1]) plt.xlabel(""FPR"", fontsize=16) plt.ylabel(""TPR"", fontsize=16) plt.legend() def performance_measures(model, X_tr=X_train_transformed, y_tr=y_train, X_ts=X_test_transformed, y_ts=y_test, store_results=True): train_mcc = cross_val_score(model, X_tr, y_tr, scoring=Matthew, cv=kf, n_jobs=-1) test_mcc = cross_val_score(model, X_ts, y_ts, scoring=Matthew, cv=kf, n_jobs=-1) print(""Mean Train MCC: {}\ Mean Test MCC: {}"".format(train_mcc.mean(), test_mcc.mean())) train_roc_auc = cross_val_score(model, X_tr, y_tr, scoring='roc_auc', cv=kf, n_jobs=-1) test_roc_auc = cross_val_score(model, X_ts, y_ts, scoring='roc_auc', cv=kf, n_jobs=-1) print(""Mean Train ROC AUC Score: {}\ Mean Test ROC AUC Score: {}"".format(train_roc_auc.mean(), test_roc_auc.mean())) if store_results: results.append([model.__class__.__name__, np.round(np.mean(train_roc_auc), 3), np.round(np.mean(test_roc_auc), 3), np.round(np.mean(train_mcc), 3), np.round(np.mean(test_mcc), 
3)])'",Yes,5,84.0 "def plot_feature_importance(feature_columns, importance_values, top_n_features=10): feature_imp = [ col for col in zip(feature_columns, importance_values)] feature_imp.sort(key=lambda x:x[1], reverse=True) if top_n_features: imp = pd.DataFrame(feature_imp[0:top_n_features], columns=['feature', 'importance']) else: imp = pd.DataFrame(feature_imp, columns=['feature', 'importance']) plt.figure(figsize=(10, 8)) sns.barplot(y='feature', x='importance', data=imp, orient='h') plt.title('Most Important Features', fontsize=16) plt.ylabel(""Feature"", fontsize=16) plt.xlabel("""") plt.show()'",No,5,79.0 "from sklearn.linear_model import LogisticRegression logistic_reg = LogisticRegression(solver='liblinear', C=1, penalty='l2', max_iter=1000, random_state=42, n_jobs=-1) logistic_reg.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, logistic_reg.coef_[0], top_n_features=15)",No,5,79.0 performance_measures(logistic_reg),No,4,35.0 "from sklearn.ensemble import RandomForestClassifier forest_clf = RandomForestClassifier(n_estimators=300, max_depth=16, random_state=42,n_jobs=-1) forest_clf.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, forest_clf.feature_importances_, top_n_features=15)",No,5,79.0 performance_measures(forest_clf),No,4,35.0 "from xgboost import XGBClassifier xgb_clf = XGBClassifier(n_estimators=300, max_depth=16, learning_rate=0.1, random_state=42, n_jobs=-1) xgb_clf.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, xgb_clf.feature_importances_, top_n_features=15)",No,5,79.0 performance_measures(xgb_clf),No,4,35.0 "from catboost import CatBoostClassifier catboost_clf = CatBoostClassifier(loss_function='Logloss', iterations=500, depth=6, l2_leaf_reg=1, cat_features=list(range(X_cb_train_transformed.shape[1])), eval_metric='AUC', random_state=42, verbose=0) catboost_clf.fit(X_cb_train_transformed, y_train)",Yes,5,7.0 "performance_measures(catboost_clf, X_tr=X_cb_train_transformed, X_ts=X_cb_test_transformed)",No,5,53.0 "plot_feature_importance(feature_columns, catboost_clf.feature_importances_, top_n_features=15)",No,5,79.0 "from sklearn.ensemble import VotingClassifier voting_reg = VotingClassifier(estimators=named_estimators, voting='soft', n_jobs=-1) voting_reg.fit(X_train, y_train)",Yes,5,7.0 "performance_measures(voting_reg, X_tr=X_train, X_ts=X_test)",No,4,35.0 "result_df = pd.DataFrame(results, columns=['Model', 'CV Train AUC Score', 'CV Test AUC Score', 'CV Train MCC', 'CV Test MCC']) result_df",No,5,12.0 "plt.figure(figsize=(8, 5)) plot_custom_roc_curve('Logistic Regression', y_test, logistic_reg.decision_function(X_test_transformed)) plot_custom_roc_curve('Random Forest', y_test, forest_clf.predict_proba(X_test_transformed)[:,1]) plot_custom_roc_curve('XGBoost', y_test, xgb_clf.predict_proba(X_test_transformed)[:,1]) plot_custom_roc_curve('CatBoost', y_test, catboost_clf.predict_proba(X_cb_test_transformed)[:,1]) plot_custom_roc_curve('Soft Voting', y_test, voting_reg.predict_proba(X_test)[:,1]) plt.show()",No,5,53.0 "final_model = Pipeline([('pre_process', cat_boost_pre_process), ('catboost', catboost_clf)]) final_model.fit(X_train, y_train)",Yes,5,7.0 "test_data = pd.read_csv('../input/amazon-employee-access-challenge/test.csv') test_data.head()",No,4,45.0 "output = pd.DataFrame(test_data['id']) test_data = test_data.drop('id', axis=1)",Yes,4,12.0 predictions = final_model.predict(test_data),No,5,48.0 output['ACTION'] = predictions.copy(),No,3,12.0 
"output.to_csv(""./submission.csv"", index=False)",No,5,25.0 "MAX_WEIGHT = 50.0 toys = { ""horse"": { ""sample"": lambda: max(0, np.random.normal(5,2,1)[0]), ""sample_type"": ""normal(5,2)"" }, ""ball"": { ""sample"": lambda: max(0, 1 + np.random.normal(1,0.3,1)[0]), ""sample_type"": ""normal(1,0.3)"" }, ""bike"": { ""sample"": lambda: max(0, np.random.normal(20,10,1)[0]), ""sample_type"": ""normal(20,10)"" }, ""train"": { ""sample"": lambda: max(0, np.random.normal(10,5,1)[0]), ""sample_type"": ""normal(10,5)"" }, ""coal"": { ""sample"": lambda: 47 * np.random.beta(0.5,0.5,1)[0], ""sample_type"": ""47*beta(0.5,0.5)"" }, ""book"": { ""sample"": lambda: np.random.chisquare(2,1)[0], ""sample_type"": ""chi(2)"" }, ""doll"": { ""sample"": lambda: np.random.gamma(5,1,1)[0], ""sample_type"": ""gamma(5,1)"" }, ""block"": { ""sample"": lambda: np.random.triangular(5,10,20,1)[0], ""sample_type"": ""triagl(5,10,20)"" }, ""gloves"": { ""sample"": lambda: 3.0 + np.random.rand(1)[0] if np.random.rand(1) < 0.3 else np.random.rand(1)[0], ""sample_type"": ""0.3:3+rand(1), 0.7:rand(1)"" }, } toy_names = list(toys) gifts_df = pd.read_csv(""../input/gifts.csv"", sep="","") gifts = gifts_df[""GiftId""].values print(""{} gifts"".format(len(gifts))) for t in toys: # get ranges samples = [toys[t][""sample""]() for _ in range(1000)] toys[t][""max""] = max(samples) toys[t][""min""] = min(samples) # get gift counts ids = [g for g in gifts if t in g.split(""_"")[0]] toys[t][""ids""] = ids toys[t][""count""] = len(ids) # print toy type stats print(""{:4}\\tdist: {:26}\\trange:{:5.2f} - {:5.2f}\\tcount:{:6,}"".format(t, toys[t][""sample_type""], toys[t][""min""], toys[t][""max""], toys[t][""count""]))'",Yes,3,8.0 "X_test['Store'].nunique(), X_test['Dept'].nunique(), X_test['Date'].nunique()",No,5,54.0 "X_test.shape, X_train.shape",No,5,58.0 "X_train['Year'] = pd.DatetimeIndex(X_train['Date']).year X_train['Month'] = pd.DatetimeIndex(X_train['Date']).month X_train['woy'] = pd.DatetimeIndex(X_train['Date']).weekofyear X_train['quarter'] = pd.DatetimeIndex(X_train['Date']).quarter X_test['Year'] = pd.DatetimeIndex(X_test['Date']).year X_test['Month'] = pd.DatetimeIndex(X_test['Date']).month X_test['woy'] = pd.DatetimeIndex(X_test['Date']).weekofyear X_test['quarter'] = pd.DatetimeIndex(X_test['Date']).quarter ## for future reference ## # df['dow'] = df.index.dayofweek # df['doy'] = df.index.dayofyear",No,5,8.0 X_all['Store'].unique(),No,5,57.0 "cols_num = [col for col in X_train.columns if X_train[col].dtype in [float, int]] ncols = len(cols_num) // 4 fig, axes = plt.subplots(ncols=ncols, nrows=5, figsize=(30,16)) i = 1 for j, col in enumerate(cols_num): sns.distplot(X_train[col], bins=10, ax=axes[i-1][j % ncols]) if j % ncols == (ncols - 1): i += 1 plt.tight_layout()",No,5,33.0 "X_train_tf['Date2'] = pd.to_datetime(X_train_tf['Date'], utc = True) X_test['Date2'] = pd.to_datetime(X_test['Date'], utc = True) X_train_tf['Weekly_Sales_tf_Lag_52_Weeks'] = X_train_tf.merge(X_all, left_on=['Store', 'Dept', 'Date2'], right_on=['Store', 'Dept', 'Date2'], how='inner')['Weekly_Sales_tf_Lag_52_Weeks'] X_test['Weekly_Sales_tf_Lag_52_Weeks'] = X_test.merge(X_all, left_on=['Store', 'Dept', 'Date2'], right_on=['Store', 'Dept', 'Date2'], how='inner')['Weekly_Sales_tf_Lag_52_Weeks'] X_train_tf.drop(['Date2', 'outlier'], axis=1, inplace=True) X_test.drop('Date2', axis=1, inplace=True) X_test.isna().sum()",No,3,32.0 "b""fig, axes = plt.subplots(ncols=5, figsize=(20,8))\nsns.distplot(X_train['Weekly_Sales'], bins=10, 
ax=axes[0]).set_title('Weekly Sales')\nsns.distplot(X_train['Weekly_Sales_Log'], bins=10, ax=axes[1]).set_title('Log(1+Weekly Sales)')\nsns.distplot(X_train_tf['Weekly_Sales_Log'], bins=10, ax=axes[2]).set_title('Log(1+Weekly Sales)\\nno outliers')\nsns.distplot(X_train['Weekly_Sales_tf'], bins=10, ax=axes[3]).set_title('(1+Weekly Sales)\\nQauntile Transformer')\nsns.distplot(X_train_tf['Weekly_Sales_tf'], bins=10, ax=axes[4]).set_title('(1+Weekly Sales)\\nQauntile Transformer\\nno outliers')\n\nplt.tight_layout()""",No,5,33.0 "print(X_train_tf.shape) X_train_tf.head()",No,4,41.0 "b""print(X_train_tf['Type'].value_counts(), '\\n', X_test['Type'].value_counts())\nprint(X_train_tf['IsHoliday'].value_counts(), '\\n', X_test['IsHoliday'].value_counts())""",No,5,72.0 "lbl_encoder = LabelEncoder() X_train_tf['IsHoliday'] = X_train_tf['IsHoliday'].replace(True, 5).replace(False, 1).values # go off the custom weighted-mae function X_train_tf['Type'] = lbl_encoder.fit_transform(X_train_tf['Type']) X_test['IsHoliday'] = X_test['IsHoliday'].replace(True, 5).replace(False, 1).values # go off the custom weighted-mae function X_test['Type'] = lbl_encoder.transform(X_test['Type'])",No,5,20.0 X_train_tf.head(),No,5,41.0 "def weighted_mae_custom(y_true, y_pred): ''' Custom weighting function as specified in the evaluation section. ''' weights = X_train_tf['IsHoliday'] sample_weights = pd.Series(weights.loc[y_true.index.values].values.reshape(-1)).dropna() return (1.0 / np.sum(sample_weights)) * np.sum(sample_weights * np.abs(y_true - y_pred)) weighted_mae = make_scorer(weighted_mae_custom)",No,5,84.0 X_train_tf.dtypes,No,5,70.0 "X_test.drop('Date', axis=1).isna().sum()",No,5,39.0 "## QuantileTransformer ## X_test['Weekly_Sales'] = qt.inverse_transform(best_models[2].predict(X_test.drop('Date', axis=1)).reshape(-1, 1)) + 1 X_test.head()",No,3,8.0 "import pandas as pd import os import numpy as np from sklearn.preprocessing import LabelEncoder print(os.listdir('../input')) file = ['train_users.csv', 'age_gender_bkts.csv', 'sessions.csv', 'countries.csv', 'test_users.csv'] data = {} for f in file: data[f.replace('.csv','')]=pd.read_csv('../input/'+f) train = data['train_users'] test = data['test_users'] # train = train.fillna(-100) # test = test.fillna(-100) age = data['age_gender_bkts'] sessions = data['sessions'] country = data['countries'] target = train['country_destination'] train = train.drop(['country_destination'],axis=1) ",No,3,22.0 "# temp = pd.DataFrame(train.apply(lambda row: isinstance(row['date_first_booking'], float), axis = 1)) # temp['destination'] = (target == 'NDF') # temp['comparison'] = temp.apply(lambda x: x[0] != x['destination'], axis = 1) # temp.apply(sum)",No,4,8.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression, RandomizedLasso from sklearn.ensemble import GradientBoostingClassifier from sklearn.cross_validation import train_test_split from sklearn import metrics from sklearn import cross_validation from sklearn.preprocessing import OneHotEncoder, Imputer, LabelEncoder from sklearn.learning_curve import learning_curve from sklearn.grid_search import GridSearchCV from sklearn.pipeline import Pipeline",No,5,23.0 " plot_params = { ""font.size"":14.0, ""figure.figsize"": (6, 4), ""axes.labelsize"": ""large"", ""figure.autolayout"": True, ""patch.edgecolor"":""white"", 
""axes.facecolor"": ""#f0f0f0"", ""patch.edgecolor"": ""#f0f0f0"", ""figure.facecolor"": ""#f0f0f0"", ""grid.linestyle"": ""-"", ""grid.linewidth"": 1.0, ""grid.color"": ""#cbcbcb"", ""savefig.edgecolor"": ""#f0f0f0"", ""savefig.facecolor"": ""#f0f0f0"" } sns.set(rc=plot_params)",No,5,23.0 "# get airbnb & test csv files as a DataFrame airbnb_df = pd.read_csv('../input/train_users.csv') test_df = pd.read_csv('../input/test_users.csv') # preview the data airbnb_df.head()",No,4,45.0 "# drop unnecessary columns, these columns won't be useful in analysis and prediction airbnb_df = airbnb_df.drop(['date_account_created','timestamp_first_active'], axis=1) test_df = test_df.drop(['date_account_created','timestamp_first_active'], axis=1)",No,5,10.0 "airbnb_df['booked'] = (airbnb_df['country_destination'] != 'NDF').astype(int) gp = airbnb_df[['id','country_destination']].groupby('country_destination').count() ax = gp.sort('id').plot(kind='barh', color=['#0059b3']) ax.set_xlabel('# of bookings') ax.set_ylabel('country destination') ax.legend_.remove()",No,5,81.0 "# date_first_booking def get_year(date): if date == date: return int(str(date)[:4]) return date def get_month(date): if date == date: return int(str(date)[5:7]) return date # Create Year and Month columns airbnb_df['Year'] = airbnb_df['date_first_booking'].apply(get_year) airbnb_df['Month'] = airbnb_df['date_first_booking'].apply(get_month) test_df['Year'] = test_df['date_first_booking'].apply(get_year) test_df['Month'] = test_df['date_first_booking'].apply(get_month) # fill NaN airbnb_df['Year'].fillna(airbnb_df['Year'].median(), inplace=True) airbnb_df['Month'].fillna(airbnb_df['Month'].median(), inplace=True) test_df['Year'].fillna(test_df['Year'].median(), inplace=True) test_df['Month'].fillna(test_df['Month'].median(), inplace=True) # convert type to integer airbnb_df[['Year', 'Month']] = airbnb_df[['Year', 'Month']].astype(int) test_df[['Year', 'Month']] = test_df[['Year', 'Month']].astype(int) ",No,3,8.0 "# age

# assign all age values > 100 to NaN, these NaN values will be replaced with real ages below
airbnb_df[""age""][airbnb_df[""age""] > 100] = np.NaN
test_df[""age""][test_df[""age""] > 100] = np.NaN

# get average, std, and number of NaN values in airbnb_df
average_age_airbnb = airbnb_df[""age""].mean()
std_age_airbnb = airbnb_df[""age""].std()
count_nan_age_airbnb = airbnb_df[""age""].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test = test_df[""age""].mean()
std_age_test = test_df[""age""].std()
count_nan_age_test = test_df[""age""].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_airbnb - std_age_airbnb, average_age_airbnb + std_age_airbnb, size = count_nan_age_airbnb)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# fill NaN values in Age column with random values generated
airbnb_df[""age""][np.isnan(airbnb_df[""age""])] = rand_1
test_df[""age""][np.isnan(test_df[""age""])] = rand_2

# convert type to integer
airbnb_df[\'age\'] = airbnb_df[\'age\'].astype(int)
test_df[\'age\'] = test_df[\'age\'].astype(int)",No,3,17.0 "# signup_method
airbnb_df[""signup_method""] = (airbnb_df[""signup_method""] == ""basic"").astype(int)
test_df[""signup_method""] = (test_df[""signup_method""] == ""basic"").astype(int)

# signup_flow
airbnb_df[""signup_flow""] = (airbnb_df[""signup_flow""] == 3).astype(int)
test_df[""signup_flow""] = (test_df[""signup_flow""] == 3).astype(int)

# language
airbnb_df[""language""] = (airbnb_df[""language""] == \'en\').astype(int)
test_df[""language""] = (test_df[""language""] == \'en\').astype(int)

# affiliate_channel
airbnb_df[""affiliate_channel""] = (airbnb_df[""affiliate_channel""] == \'direct\').astype(int)
test_df[""affiliate_channel""] = (test_df[""affiliate_channel""] == \'direct\').astype(int)

# affiliate_provider
airbnb_df[""affiliate_provider""] = (airbnb_df[""affiliate_provider""] == \'direct\').astype(int)
test_df[""affiliate_provider""] = (test_df[""affiliate_provider""] == \'direct\').astype(int)
",No,5,16.0 "for f in airbnb_df.columns:
if f == ""country_destination"" or f == ""id"": continue
if airbnb_df[f].dtype == \'object\':
lbl = LabelEncoder()
lbl.fit(np.unique(list(airbnb_df[f].values) + list(test_df[f].values)))
airbnb_df[f] = lbl.transform(list(airbnb_df[f].values))
test_df[f] = lbl.transform(list(test_df[f].values))",No,5,20.0 "X = airbnb_df.drop([""country_destination"", ""id"", \'booked\'],axis=1)
y = airbnb_df[""country_destination""]
test = test_df.drop(""id"",axis=1).copy()",No,4,21.0 "# modify country_destination to numerical values country_num_dic = {'NDF': 0, 'US': 1, 'other': 2, 'FR': 3, 'IT': 4, 'GB': 5, 'ES': 6, 'CA': 7, 'DE': 8, 'NL': 9, 'AU': 10, 'PT': 11} num_country_dic = {y:x for x,y in country_num_dic.items()} y = y.map(country_num_dic)",No,5,20.0 "# convert type to integer ypred = ypred.astype(int) # change values back to original country symbols ypred = Series(ypred).map(num_country_dic)",No,4,16.0 "
# Create submission

country_df = pd.DataFrame({
""id"": test_df[""id""],
""country"": ypred
})

submission = DataFrame(columns=[""id"", ""country""])

# sort countries according to most probable destination country
for key in country_df[\'country\'].value_counts().index:
submission = pd.concat([submission, country_df[country_df[""country""] == key]], ignore_index=True)

submission.to_csv(\'airbnb.csv\', index=False)",No,5,25.0 "from pandas import Series,DataFrame
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np

names=[
\'Field6\',
\'Field7\',
\'Field8\',
\'Field10\',
\'CoverageField2A\',
\'CoverageField2B\',
\'CoverageField3A\',
\'CoverageField4B\',
\'CoverageField5A\',
\'CoverageField5B\',
\'CoverageField6A\',
\'CoverageField6B\',
\'CoverageField8\',
\'CoverageField11A\',
\'CoverageField11B\',
\'SalesField1A\',
\'SalesField1B\',
\'SalesField2A\',
\'SalesField2B\',
\'SalesField3\',
\'SalesField4\',
\'SalesField6\',
\'SalesField7\',
\'SalesField8\',
\'SalesField9\',
\'SalesField10\',
\'SalesField12\',
\'SalesField13\',
\'SalesField14\',
\'SalesField15\',
\'PersonalField1\',
\'PersonalField2\',
\'PersonalField4B\',
\'PersonalField5\',
\'PersonalField6\',
\'PersonalField7\',
\'PersonalField8\',
\'PersonalField9\',
\'PersonalField10A\',
\'PersonalField10B\',
\'PersonalField11\',
\'PersonalField12\',
\'PersonalField13\',
\'PersonalField15\',
\'PersonalField16\',
\'PersonalField17\',
\'PersonalField18\',
\'PersonalField19\',
\'PersonalField22\',
\'PersonalField23\',
\'PersonalField25\',
\'PersonalField27\',
\'PersonalField29\',
\'PersonalField33\',
\'PersonalField34\',
\'PersonalField36\',
\'PersonalField37\',
\'PersonalField38\',
\'PersonalField39\',
\'PersonalField40\',
\'PersonalField41\',
\'PersonalField42\',
\'PersonalField47\',
\'PersonalField48\',
\'PersonalField49\',
\'PersonalField50\',
\'PersonalField51\',
\'PersonalField52\',
\'PersonalField53\',
\'PersonalField56\',
\'PersonalField57\',
\'PersonalField59\',
\'PersonalField60\',
\'PersonalField62\',
\'PersonalField63\',
\'PersonalField64\',
\'PersonalField66\',
\'PersonalField69\',
\'PersonalField70\',
\'PersonalField71\',
\'PersonalField74\',
\'PersonalField75\',
\'PersonalField77\',
\'PersonalField81\',
\'PersonalField82\',
\'PersonalField83\',
\'PersonalField84\',
\'PropertyField1A\',
\'PropertyField1B\',
\'PropertyField2A\',
\'PropertyField2B\',
\'PropertyField3\',
\'PropertyField4\',
\'PropertyField6\',
\'PropertyField7\',
\'PropertyField8\',
\'PropertyField9\',
\'PropertyField10\',
\'PropertyField11B\',
\'PropertyField12\',
\'PropertyField13\',
\'PropertyField14\',
\'PropertyField15\',
\'PropertyField16B\',
\'PropertyField18\',
\'PropertyField19\',
\'PropertyField20\',
\'PropertyField21B\',
\'PropertyField22\',
\'PropertyField23\',
\'PropertyField24B\',
\'PropertyField25\',
\'PropertyField26A\',
\'PropertyField26B\',
\'PropertyField27\',
\'PropertyField28\',
\'PropertyField29\',
\'PropertyField30\',
\'PropertyField31\',
\'PropertyField32\',
\'PropertyField33\',
\'PropertyField34\',
\'PropertyField35\',
\'PropertyField36\',
\'PropertyField37\',
\'PropertyField38\',
\'PropertyField39A\',
\'GeographicField1A\',
\'GeographicField2B\',
\'GeographicField4A\',
\'GeographicField4B\',
\'GeographicField5A\',
\'GeographicField6A\',
\'GeographicField8A\',
\'GeographicField11A\',
\'GeographicField13B\',
\'GeographicField15A\',
\'GeographicField16B\',
\'GeographicField17A\',
\'GeographicField17B\',
\'GeographicField18A\',
\'GeographicField20B\',
\'GeographicField21B\',
\'GeographicField22A\',
\'GeographicField22B\',
\'GeographicField23A\',
\'GeographicField23B\',
\'GeographicField24A\',
\'GeographicField26A\',
\'GeographicField27A\',
\'GeographicField29B\',
\'GeographicField30B\',
\'GeographicField32A\',
\'GeographicField33B\',
\'GeographicField36B\',
\'GeographicField37B\',
\'GeographicField38A\',
\'GeographicField39B\',
\'GeographicField41A\',
\'GeographicField41B\',
\'GeographicField42B\',
\'GeographicField43A\',
\'GeographicField44A\',
\'GeographicField45A\',
\'GeographicField45B\',
\'GeographicField46B\',
\'GeographicField48A\',
\'GeographicField48B\',
\'GeographicField50B\',
\'GeographicField52B\',
\'GeographicField53B\',
\'GeographicField54B\',
\'GeographicField55B\',
\'GeographicField56A\',
\'GeographicField59A\',
\'GeographicField59B\',
\'GeographicField60A\',
\'GeographicField60B\',
\'GeographicField61A\',
\'GeographicField61B\',
\'GeographicField62A\',
\'GeographicField62B\',
\'GeographicField63\',
\'Year\',
\'Month\'
]


import random
from datetime import datetime
import pandas as pd
from pandas import DataFrame as df
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was replaced by model_selection
from sklearn import preprocessing

train = pd.read_csv(\'../input/train.csv\')
test = pd.read_csv(\'../input/test.csv\')

train_sample = np.random.choice(train.index.values,130000)
train = train.loc[train_sample]  # .ix was removed from pandas; .loc selects by index label
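# note: np.random.choice samples with replacement by default, so the 130,000 rows drawn above
# may contain duplicates; replace=False (or train.sample(130000)) would give a true subsample.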

# Converting date into datetime format
train[\'Date\'] = pd.to_datetime(pd.Series(train[\'Original_Quote_Date\']))
# Dropping original date column
train = train.drop(\'Original_Quote_Date\', axis=1)

test[\'Date\'] = pd.to_datetime(pd.Series(test[\'Original_Quote_Date\']))
test = test.drop(\'Original_Quote_Date\', axis=1)

## Separating date into 3 columns
train[\'Year\'] = train[\'Date\'].apply(lambda x: int(str(x)[:4]))
train[\'Month\'] = train[\'Date\'].apply(lambda x: int(str(x)[5:7]))
train[\'weekday\'] = train[\'Date\'].dt.dayofweek

test[\'Year\'] = test[\'Date\'].apply(lambda x: int(str(x)[:4]))
test[\'Month\'] = test[\'Date\'].apply(lambda x: int(str(x)[5:7]))
test[\'weekday\'] = test[\'Date\'].dt.dayofweek

train = train.drop(\'Date\', axis=1)
test = test.drop(\'Date\', axis=1)

## Filling NA values with -1

train = train.fillna(-1)
test = test.fillna(-1)
test_ori=test

y = train.QuoteConversion_Flag.values

#columns choice--gmm
train=DataFrame(train,columns=names)
test=DataFrame(test,columns=names)

for f in train.columns:
if train[f].dtype==\'object\':
print(f)
lbl=preprocessing.LabelEncoder()
lbl.fit(list(train[f].values)+list(test[f].values))
train[f]=lbl.transform(list(train[f].values))
test[f]=lbl.transform(list(test[f].values))

import xgboost as xgb

X_train = train
Y_train =y
X_test = test

params = {""objective"": ""binary:logistic""}
T_train_xgb = xgb.DMatrix(X_train, Y_train)
X_test_xgb = xgb.DMatrix(X_test)
gbm = xgb.train(params, T_train_xgb, 20)
Y_pred = gbm.predict(X_test_xgb)
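# note: with objective 'binary:logistic' the booster's predict() returns probabilities in
# [0, 1], which suits the AUC-scored QuoteConversion_Flag column written below.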
# Create submission
submission = pd.DataFrame()
submission[""QuoteNumber""] = test_ori[""QuoteNumber""]
submission[""QuoteConversion_Flag""] = Y_pred
submission.to_csv(\'homesite.csv\', index=False)
",No,2,23.0 "import pandas as pd import numpy as np",No,5,22.0 "df_train_users = pd.read_csv('../input/train_users_2.csv') df_test_users = pd.read_csv('../input/test_users.csv')",No,5,45.0 "# let\'s look at the destinations accounted for each occurence in the train set
df_train_users.groupby(""country_destination"").count()[""id""]",No,5,60.0 "# the 5 most frequent ""destinations"" are [""NDF"",""US"",""other"",""FR"",""IT""]
# baseline: predict [""NDF"",""US"",""other"",""FR"",""IT""] for each user in the test set

res = [[x, destination] for x in df_test_users[""id""] for destination in [""NDF"",""US"",""other"",""FR"",""IT""]]
sub_baseline = pd.DataFrame(np.array(res), columns=[\'id\', \'country\'])
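# note: the competition metric is NDCG@5, so each test user gets up to five ranked rows;
# repeating the five globally most frequent destinations is the usual frequency baseline.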
sub_baseline.to_csv(\'sub_baseline.csv\', index=False)",No,5,25.0 "import numpy as np
import pandas as pd
import math
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler


# https://github.com/sublee/elo/blob/master/elo.py
""""""
elo
~~~
The Elo rating system.
:copyright: (c) 2012 by Heungsub Lee
:license: BSD, see LICENSE for more details.
""""""
from datetime import datetime
import inspect

__version__ = \'0.1.dev\'
__all__ = [\'Elo\', \'Rating\', \'CountedRating\', \'TimedRating\', \'rate\', \'adjust\',
\'expect\', \'rate_1vs1\', \'adjust_1vs1\', \'quality_1vs1\', \'setup\',
\'global_env\', \'WIN\', \'DRAW\', \'LOSS\', \'K_FACTOR\', \'RATING_CLASS\',
\'INITIAL\', \'BETA\']


#: The actual score for win.
WIN = 1.01
#: The actual score for draw.
DRAW = 0.5
#: The actual score for loss.
LOSS = 0.

#: Default K-factor.
K_FACTOR = 10
#: Default rating class.
RATING_CLASS = float
#: Default initial rating.
INITIAL = 1300
#: Default Beta value.
BETA = 170


class Rating(object):

try:
__metaclass__ = __import__(\'abc\').ABCMeta
except ImportError:
# for Python 2.5
pass

value = None

def __init__(self, value=None):
if value is None:
value = global_env().initial
self.value = value

def rated(self, value):
""""""Creates a :class:Rating object for the recalculated rating.
:param value: the recalculated rating value.
""""""
return type(self)(value)

def __int__(self):
""""""Type-casting to int.""""""
return int(self.value)

def __long__(self):
""""""Type-casting to long.""""""
return long(self.value)

def __float__(self):
""""""Type-casting to float.""""""
return float(self.value)

def __nonzero__(self):
""""""Type-casting to bool.""""""
return bool(int(self))

def __eq__(self, other):
return float(self) == float(other)

def __lt__(self, other):
""""""Is Rating < number.
:param other: the operand
:type other: number
""""""
return self.value < other

def __le__(self, other):
""""""Is Rating <= number.
:param other: the operand
:type other: number
""""""
return self.value <= other

def __gt__(self, other):
""""""Is Rating > number.
:param other: the operand
:type other: number
""""""
return self.value > other

def __ge__(self, other):
""""""Is Rating >= number.
:param other: the operand
:type other: number
""""""
return self.value >= other

def __iadd__(self, other):
""""""Rating += number.
:param other: the operand
:type other: number
""""""
self.value += other
return self

def __isub__(self, other):
""""""Rating -= number.
:param other: the operand
:type other: number
""""""
self.value -= other
return self

def __repr__(self):
c = type(self)
        ext_params = inspect.getfullargspec(c.__init__)[0][2:]  # getargspec was removed in Python 3.11
kwargs = \', \'.join(\'%s=%r\' % (param, getattr(self, param))
for param in ext_params)
if kwargs:
kwargs = \', \' + kwargs
args = (\'.\'.join([c.__module__, c.__name__]), self.value, kwargs)
return \'%s(%.3f%s)\' % args


try:
Rating.register(float)
except AttributeError:
pass


class CountedRating(Rating):
""""""Increases count each rating recalculation.""""""

times = None

def __init__(self, value=None, times=0):
self.times = times
super(CountedRating, self).__init__(value)

def rated(self, value):
rated = super(CountedRating, self).rated(value)
rated.times = self.times + 1
return rated


class TimedRating(Rating):
""""""Writes the final rated time.""""""

rated_at = None

def __init__(self, value=None, rated_at=None):
self.rated_at = rated_at
super(TimedRating, self).__init__(value)

def rated(self, value):
rated = super(TimedRating, self).rated(value)
rated.rated_at = datetime.utcnow()
return rated


class Elo(object):

def __init__(self, k_factor=K_FACTOR, rating_class=RATING_CLASS,
initial=INITIAL, beta=BETA):
self.k_factor = k_factor
self.rating_class = rating_class
self.initial = initial
self.beta = beta

def expect(self, rating, other_rating):
""""""The ""E"" function in Elo. It calculates the expected score of the
first rating by the second rating.
""""""
# http://www.chess-mind.com/en/elo-system
diff = float(other_rating) - float(rating)
f_factor = 2 * self.beta # rating disparity
return 1. / (1 + 11 ** (diff / f_factor))
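        # worked example (beta=170 -> divisor 340): for rating=1300 vs other_rating=1470,
        # diff=170 and expect = 1 / (1 + 11 ** 0.5) ~ 0.23, i.e. roughly a 1-in-4 chance.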

def adjust(self, rating, series):
""""""Calculates the adjustment value.""""""
return sum(score - self.expect(rating, other_rating)
for score, other_rating in series)

def rate(self, rating, series):
""""""Calculates new ratings by the game result series.""""""
rating = self.ensure_rating(rating)
k = self.k_factor(rating) if callable(self.k_factor) else self.k_factor
new_rating = float(rating) + k * self.adjust(rating, series)
if hasattr(rating, \'rated\'):
new_rating = rating.rated(new_rating)
return new_rating

def adjust_1vs1(self, rating1, rating2, drawn=False):
return self.adjust(rating1, [(DRAW if drawn else WIN, rating2)])

def rate_1vs1(self, rating1, rating2, drawn=False):
scores = (DRAW, DRAW) if drawn else (WIN, LOSS)
return (self.rate(rating1, [(scores[0], rating2)]),
self.rate(rating2, [(scores[1], rating1)]))

def quality_1vs1(self, rating1, rating2):
return 2 * (0.5 - abs(0.5 - self.expect(rating1, rating2)))

def create_rating(self, value=None, *args, **kwargs):
if value is None:
value = self.initial
return self.rating_class(value, *args, **kwargs)

def ensure_rating(self, rating):
if isinstance(rating, self.rating_class):
return rating
return self.rating_class(rating)

def make_as_global(self):
""""""Registers the environment as the global environment.
>>> env = Elo(initial=2000)
>>> Rating()
elo.Rating(1200.000)
>>> env.make_as_global() #doctest: +ELLIPSIS
elo.Elo(..., initial=2000.000, ...)
>>> Rating()
elo.Rating(2000.000)
But if you need just one environment, use :func:setup instead.
""""""
return setup(env=self)

def __repr__(self):
c = type(self)
rc = self.rating_class
if callable(self.k_factor):
f = self.k_factor
k_factor = \'.\'.join([f.__module__, f.__name__])
else:
k_factor = \'%.3f\' % self.k_factor
args = (\'.\'.join([c.__module__, c.__name__]), k_factor,
\'.\'.join([rc.__module__, rc.__name__]), self.initial, self.beta)
return (\'%s(k_factor=%s, rating_class=%s, \'
\'initial=%.3f, beta=%.3f)\' % args)


def rate(rating, series):
return global_env().rate(rating, series)


def adjust(rating, series):
return global_env().adjust(rating, series)


def expect(rating, other_rating):
return global_env().expect(rating, other_rating)


def rate_1vs1(rating1, rating2, drawn=False):
return global_env().rate_1vs1(rating1, rating2, drawn)


def adjust_1vs1(rating1, rating2, drawn=False):
return global_env().adjust_1vs1(rating1, rating2, drawn)


def quality_1vs1(rating1, rating2):
return global_env().quality_1vs1(rating1, rating2)


def setup(k_factor=K_FACTOR, rating_class=RATING_CLASS,
initial=INITIAL, beta=BETA, env=None):
if env is None:
env = Elo(k_factor, rating_class, initial, beta)
global_env.__elo__ = env
return env


def global_env():
""""""Gets the global Elo environment.""""""
try:
global_env.__elo__
except AttributeError:
# setup the default environment
setup()
return global_env.__elo__
# -------------------------------------------------------



def Outputs(data):
return 1.-(1./(1.+np.exp(-data)))
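# note: GPIndividual1 below is a hard-coded scoring formula over the engineered features
# (apparently the output of a genetic-programming search); Outputs() maps its raw score
# through 1 - sigmoid, so larger raw values become probabilities closer to 0.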


def GPIndividual1(data):
predictions = (np.sinh(((((np.sinh(data[""team1Seed""]) - data[""team2Seed""]) + ((np.tanh(data[""team2Wmax""]) + ((data[""team1Lstd""] + np.minimum( (data[""team1losses""]), (data[""year""])))/2.0))/2.0)) + ((data[""team2Seed""] == (1.0/(1.0 + np.exp(- data[""team2wins""])))).astype(float)))/2.0)) +
((np.cos(((np.round(data[""team2Wmedian""]) <= data[""team1LAverage""]).astype(float))) - np.maximum( ((data[""team1Seed""] * data[""team2Lstd""])), (np.round(np.tanh(np.maximum( (np.maximum( (data[""team2Lstd""]), (data[""team2Wstd""]))), (data[""team1wins""]))))))) / 2.0) +
((np.floor(np.minimum( (((1.732051 == data[""team1Lmax""]).astype(float))), (np.cos(data[""team1WAverage""])))) == ((np.round(((((-(data[""team2LAverage""])) <= data[""team2losses""]).astype(float)) - 2.212120)) <= (data[""team2Wmin""] * data[""team2losses""])).astype(float))).astype(float)) +
np.minimum( ((np.abs(data[""team1Wmedian""]) - ((data[""team1Seed""] >= np.abs(data[""team1Lmedian""])).astype(float)))), (np.round(np.sinh(np.minimum( ((((data[""team2WAverage""] != data[""team1Seed""]).astype(float)) + data[""team2WAverage""])), ((-(((data[""team1Wmin""] >= 2.718282).astype(float)))))))))) +
((np.minimum( (-1.0), (data[""team2Lmax""])) > (data[""team2Wmax""] - np.minimum( (data[""team1Lmax""]), ((data[""team2Wmin""] * (data[""team2Wmax""] - np.minimum( (data[""team2Wmin""]), (np.tanh(np.sin((data[""team1Lmax""] * 2.0))))))))))).astype(float)) +
np.minimum( (((data[""team2WAverage""] >= np.floor(data[""team2Wmin""])).astype(float))), (np.abs(((data[""team1Seed""] >= np.sinh(((0.693147 > np.minimum( (data[""team2Wmedian""]), (((data[""team1Seed""] <= ((data[""team1Wmedian""] <= np.cos(0.693147)).astype(float))).astype(float))))).astype(float)))).astype(float))))) +
np.sin(np.sinh(((((-((((-(0.367879)) <= data[""team1Lmax""]).astype(float)))) + ((data[""team1Wmin""] >= np.floor(data[""team1Seed""])).astype(float)))/2.0) - ((np.sin(data[""team2Wstd""]) > (data[""team2Wmax""] + np.abs(data[""team2""]))).astype(float))))) +
(((((np.sin(data[""team1Lmax""]) > data[""team1Wstd""]).astype(float)) != ((data[""team1Lmax""] == data[""team2wins""]).astype(float))).astype(float)) * (((-(data[""team2Wmin""])) + ((data[""team1Lstd""] + np.minimum( (data[""team2wins""]), (np.minimum( (data[""team1Lmax""]), (data[""team2Wmin""])))))/2.0))/2.0)) +
np.maximum( (np.minimum( (data[""team1Wmin""]), (np.ceil(np.minimum( (np.minimum( (0.138462), (((data[""team1Seed""] >= data[""team2Lmedian""]).astype(float))))), (data[""team2losses""])))))), ((((-(np.maximum( (data[""team2""]), (data[""team2Seed""])))) > ((data[""team1losses""] < 1.414214).astype(float))).astype(float)))) +
np.minimum( (np.maximum( (data[""team1Lmin""]), ((-(((data[""team1wins""] >= np.cos(0.720430)).astype(float))))))), (np.minimum( (np.ceil((data[""team1Wmedian""] / 2.0))), ((-(((data[""team1Wmin""] >= (1.197370 + ((data[""team1Wstd""] < 0.094340).astype(float)))).astype(float)))))))) +
((((-(np.abs(np.abs(((data[""team1""] + (-(0.138462)))/2.0))))) * ((0.367879 >= data[""team2wins""]).astype(float))) > ((data[""team1Wmedian""] + np.maximum( (data[""team1""]), (((0.367879 != data[""team1""]).astype(float)))))/2.0)).astype(float)) +
((3.0 == np.maximum( (np.round(np.maximum( (np.sinh(data[""team2Lmin""])), (data[""team2LAverage""])))), (np.floor(np.maximum( (np.sinh(np.maximum( ((data[""team1""] * 2.0)), (data[""team1Wmedian""])))), (np.sinh((data[""team2Wmedian""] * np.sin(data[""team2WAverage""]))))))))).astype(float)) +
np.minimum( (np.ceil(((data[""team2Wmin""] + ((0.094340 >= data[""team2""]).astype(float)))/2.0))), ((np.minimum( ((data[""team1Lmax""] * data[""team1Wmedian""])), (((data[""team1Wmedian""] < (data[""team1Wmax""] - data[""team2Lmedian""])).astype(float)))) * np.maximum( (data[""team2Seed""]), (data[""team1Wmax""]))))) +
((-(((data[""team2Wmin""] >= ((-((data[""team2Wstd""] + 0.318310))) * (1.0/(1.0 + np.exp(- (-((1.0/(1.0 + np.exp(- 0.318310)))))))))).astype(float)))) * (((1.0/(1.0 + np.exp(- data[""year""]))) * data[""team1Lmin""]) * data[""team2Lstd""])) +
np.floor(np.cos(((data[""team1WAverage""] * np.minimum( ((data[""team1WAverage""] * data[""team2Lmax""])), (data[""team1Lstd""]))) * ((np.sin(np.round(data[""team2Lmax""])) + ((data[""team1LAverage""] != data[""team2WAverage""]).astype(float)))/2.0)))) +
np.ceil(((2.675680 <= np.abs(np.maximum( ((data[""team2""] + np.maximum( (np.abs(data[""team1Seed""])), (data[""team2LAverage""])))), ((0.318310 + (np.minimum( (data[""team1Wmedian""]), (((data[""team2""] != data[""team2LAverage""]).astype(float)))) - data[""team2Lmin""])))))).astype(float))) +
((np.sinh(np.sin(data[""team2Lstd""])) * np.round(np.minimum( (np.sin(np.sinh(np.round(data[""team1Wmin""])))), ((np.minimum( (data[""team1Lstd""]), (np.sin(data[""team2Lstd""]))) * np.sin(data[""team2Lstd""])))))) / 2.0) +
((((data[""team2""] <= ((((data[""team1Lmin""] + 5.428570)/2.0) + (np.cos(np.maximum( (data[""team1Lmin""]), (np.maximum( ((1.0/(1.0 + np.exp(- data[""team1Wmin""])))), (data[""team2LAverage""]))))) / 2.0))/2.0)).astype(float)) <= ((((data[""team2""] + 5.428570)/2.0) <= data[""team1Wmedian""]).astype(float))).astype(float)) +
np.floor(np.cos((data[""team2Lmin""] * np.minimum( ((data[""team1wins""] + ((data[""year""] + data[""team2Wmedian""])/2.0))), ((np.minimum( (data[""year""]), (data[""team1wins""])) + ((((data[""year""] > data[""team2Wmin""]).astype(float)) + data[""team2Wmedian""])/2.0))))))) +
np.minimum( ((((np.minimum( (((data[""team1Wmedian""] >= data[""team1WAverage""]).astype(float))), (data[""team2losses""])) / 2.0) > data[""team1Lmax""]).astype(float))), (((data[""team1Wmedian""] >= (1.0/(1.0 + np.exp(- ((data[""team1Wmax""] > ((data[""team1WAverage""] < (data[""team1Wmedian""] - data[""team2Lmin""])).astype(float))).astype(float)))))).astype(float)))) +
(((0.602941 <= (data[""team2Wmin""] - ((((np.cos(data[""team1Wmedian""]) * 2.0) * 2.0) >= 1.570796).astype(float)))).astype(float)) * np.sin(np.sinh(np.sinh(data[""team2Wstd""])))) +
(data[""team1losses""] * ((data[""team1Wmedian""] >= (np.tanh((((((-(data[""team1Lmin""])) > 0.434294).astype(float)) != ((np.minimum( (data[""team1wins""]), (((0.434294 <= data[""team2losses""]).astype(float)))) < data[""team1Wmedian""]).astype(float))).astype(float))) * 2.0)).astype(float))) +
np.maximum( (((np.minimum( (data[""team1Lmedian""]), (((data[""team2""] > np.maximum( (data[""team2losses""]), (0.585714))).astype(float)))) >= (data[""team1WAverage""] + ((1.414214 > data[""team2Lmin""]).astype(float)))).astype(float))), (((data[""team2Wmin""] < (data[""team1wins""] - 3.141593)).astype(float)))) +
np.round((np.round(((data[""team2Lmedian""] * ((data[""year""] > ((2.675680 + ((data[""team1LAverage""] <= np.maximum( (data[""team1WAverage""]), (0.094340))).astype(float)))/2.0)).astype(float))) * 2.0)) * 2.0)) +
((np.abs(np.sinh(np.abs(data[""team2Lstd""]))) <= (data[""team1Lmax""] * ((data[""team2losses""] <= (-(((((data[""team2Lstd""] <= np.minimum( (data[""team2wins""]), (data[""team2losses""]))).astype(float)) < np.maximum( (data[""team2Wmedian""]), (data[""team1losses""]))).astype(float))))).astype(float)))).astype(float)) +
np.minimum( (np.cos(data[""team1""])), (((((((((((data[""team1""] / 2.0) / 2.0) * 9.869604) <= 0.058823).astype(float)) <= data[""team1Wstd""]).astype(float)) == np.ceil(((data[""team1""] / 2.0) * 9.869604))).astype(float)) - 0.094340))) +
np.maximum( (np.round(((2.212120 <= (data[""team1Wmax""] - ((data[""team2Lmin""] + data[""team2Lmedian""])/2.0))).astype(float)))), (((3.0 < (data[""team2losses""] + np.maximum( ((-(((data[""team2Lmin""] + data[""team2LAverage""])/2.0)))), (data[""team1""])))).astype(float)))) +
((data[""team2wins""] - np.sin(data[""team2Wmin""])) * ((np.maximum( (data[""team2wins""]), (0.840000)) <= np.minimum( (data[""team1Lmax""]), ((np.maximum( (data[""team2Wmax""]), ((data[""team2wins""] * np.floor(data[""team2Wmax""])))) - 0.058823)))).astype(float))) +
((math.tanh((-(1.630430))) > np.sin(np.maximum( (data[""team2Wmin""]), (np.minimum( (np.minimum( (data[""team2Seed""]), (((data[""team1LAverage""] + data[""team1Wstd""])/2.0)))), ((((data[""team2Seed""] <= data[""team2Wmin""]).astype(float)) - data[""team2Lstd""]))))))).astype(float)) +
np.floor(np.cos(((1.570796 + (np.minimum( (data[""team1LAverage""]), (((data[""team1WAverage""] <= ((((((data[""team1Wmin""] + data[""team1WAverage""])/2.0) < ((data[""team2Seed""] >= data[""team1Seed""]).astype(float))).astype(float)) + (data[""team2Lmedian""] * 0.636620))/2.0)).astype(float)))) * 2.0))/2.0))) +
((data[""team2Wmin""] > ((0.318310 + (((1.0/(1.0 + np.exp(- (((data[""team2Seed""] * np.maximum( (data[""team2""]), (data[""team1Lmax""]))) <= ((data[""team2losses""] < data[""team1Lmax""]).astype(float))).astype(float))))) < data[""team2Wmin""]).astype(float))) * 2.0)).astype(float)) +
np.sinh(np.floor((0.367879 - (((((np.minimum( (data[""team1LAverage""]), (np.floor(data[""team2WAverage""]))) == ((2.409090 < -3.0))).astype(float)) + (data[""team2wins""] * np.sin(np.minimum( (data[""team1Lmax""]), (data[""team2WAverage""])))))/2.0) / 2.0)))) +
(((data[""team1Wmax""] < (-2.0 + ((data[""team1wins""] < ((data[""team1Wstd""] - (((data[""team1Wstd""] * data[""team1Wstd""]) < data[""team2Wstd""]).astype(float))) - np.sinh((((data[""team2Wmax""] > data[""team2Lstd""]).astype(float)) * 2.0)))).astype(float)))).astype(float)) * 2.0) +
np.tanh(np.sin(np.round(np.tanh((data[""team2Wmax""] * ((np.round(data[""team2LAverage""]) == ((((data[""team1Wmin""] < data[""team1LAverage""]).astype(float)) > ((data[""team2Wmin""] >= np.cos(np.minimum( (data[""team2Wmax""]), (data[""team2LAverage""])))).astype(float))).astype(float))).astype(float))))))) +
np.minimum( (np.cos(data[""team1losses""])), (((1.197370 < (data[""team2""] * ((data[""team1Lmax""] + np.round(((data[""team1Wstd""] - ((((data[""team1Lmax""] / 2.0) > data[""team2Wmax""]).astype(float)) / 2.0)) / 2.0)))/2.0))).astype(float)))) +
np.abs(((((data[""team1WAverage""] > data[""team2Wstd""]).astype(float)) * 2.0) * (np.tanh(1.732051) * ((((data[""team1wins""] <= 1.732051).astype(float)) < ((np.cos(data[""team2Lmedian""]) > np.abs(np.sin((data[""team2losses""] * 2.0)))).astype(float))).astype(float))))) +
np.minimum( (np.cos((data[""team1Wmin""] * data[""team2WAverage""]))), (np.minimum( (((-(((np.abs(data[""team1WAverage""]) > 1.414214).astype(float)))) / 2.0)), (np.cos(np.maximum( (data[""team1Wmax""]), ((data[""team1wins""] - data[""team2Wmedian""])))))))) +
np.abs(np.minimum( (np.minimum( (((np.abs(data[""team1Lmax""]) > ((1.732051 > (data[""team1Wmedian""] + data[""team1""])).astype(float))).astype(float))), (np.cos(np.minimum( (data[""team2wins""]), ((-(np.abs(data[""team1Lmax""]))))))))), (np.cos(data[""team1LAverage""])))) +
((((((data[""team1WAverage""] >= ((data[""team1Lmax""] < np.sin(1.584910)).astype(float))).astype(float)) < data[""team1Wstd""]).astype(float)) * ((2.302585 < data[""team2Wmedian""]).astype(float))) * 2.0) +
(-(((((((np.ceil(np.minimum( (data[""team2Wmax""]), (data[""team2Wstd""]))) >= (-(data[""team2Lmin""]))).astype(float)) > ((data[""team2Seed""] <= np.ceil((data[""team1Seed""] / 2.0))).astype(float))).astype(float)) + ((5.200000 <= np.floor(data[""team2Wmax""])).astype(float)))/2.0))) +
np.minimum( (((data[""year""] > data[""team1Lmedian""]).astype(float))), (np.minimum( (((data[""team2Wmedian""] < np.cos(data[""team1""])).astype(float))), ((np.maximum( (data[""team1Lmin""]), ((np.round(data[""team2Lmedian""]) + np.round(np.round(data[""team2Lmedian""]))))) / 2.0))))) +
((np.minimum( (np.minimum( (((data[""team1Lmin""] <= np.cos(data[""team2Wmin""])).astype(float))), (data[""team2losses""]))), (data[""team1Seed""])) >= ((((data[""team1LAverage""] < np.cos(((data[""team2Wmin""] < data[""team1LAverage""]).astype(float)))).astype(float)) != ((data[""team1Lmin""] <= np.cos(2.675680)).astype(float))).astype(float))).astype(float)) +
(np.minimum( (np.cos(data[""team2Seed""])), (np.floor(np.cos((data[""team1losses""] * ((1.0/(1.0 + np.exp(- (-(((((data[""team1""] <= np.cos(data[""team2Seed""])).astype(float)) >= np.maximum( (data[""team1Wstd""]), (data[""team1Wmedian""]))).astype(float))))))) * 2.0)))))) * 2.0) +
(3.141593 * (3.141593 * ((np.tanh(data[""team1Wmin""]) >= (((data[""team1losses""] < ((3.141593 + ((np.round(data[""team2Wmin""]) <= (data[""team1losses""] * data[""team2Wmax""])).astype(float)))/2.0)).astype(float)) * 2.0)).astype(float)))) +
np.tanh((data[""team1Lmax""] * (-(((data[""team2Wmax""] > (1.197370 - (((((data[""team2LAverage""] > 1.197370).astype(float)) / 2.0) == ((((((data[""team2Wmax""] != data[""team1Wstd""]).astype(float)) > data[""team1""]).astype(float)) > data[""team1Lmin""]).astype(float))).astype(float)))).astype(float)))))) +
((np.minimum( (data[""team1wins""]), (np.minimum( (data[""team1wins""]), (data[""team1Wmax""])))) > np.abs((((((data[""team1Wmax""] + data[""team1Wmax""])/2.0) * (data[""team2Lmedian""] * data[""team1LAverage""])) < np.cos(np.minimum( (data[""team2Seed""]), (data[""team1Lmedian""])))).astype(float)))).astype(float)) +
(-(np.maximum( (((data[""team1WAverage""] > (np.abs(data[""team1Lmin""]) + 2.212120)).astype(float))), (np.minimum( ((((1.0/(1.0 + math.exp(- 0.693147))) <= (-(data[""team1Lstd""]))).astype(float))), ((data[""team2Lmin""] * 2.212120))))))) +
(np.minimum( (0.585714), (np.maximum( (data[""team2Wmax""]), (np.ceil(data[""team1WAverage""]))))) * ((np.cos(data[""team2Lmin""]) < ((((2.0 > data[""team2wins""]).astype(float)) <= (data[""team1Lmin""] * ((data[""team2Lmin""] > data[""team2wins""]).astype(float)))).astype(float))).astype(float))) +
np.floor(np.cos((data[""team1WAverage""] * np.maximum( (data[""team2Lmax""]), (np.maximum( (((data[""team1Wstd""] + ((data[""team1Lstd""] + data[""team2Lstd""])/2.0))/2.0)), (np.sin(((((data[""team2Lstd""] <= data[""team1Lstd""]).astype(float)) + -2.0)/2.0))))))))) +
(((((np.round(data[""team2Lmax""]) >= ((((1.584910 <= data[""team1wins""]).astype(float)) >= ((((data[""team2Lmax""] <= 2.718282).astype(float)) >= data[""team1Lstd""]).astype(float))).astype(float))).astype(float)) < np.minimum( (data[""team2Lmax""]), ((1.630430 + data[""team1""])))).astype(float)) / 2.0) +
np.sin(np.minimum( ((data[""team1Wmedian""] * (3.141593 * np.sinh(np.maximum( (data[""team1""]), (data[""team2Lstd""])))))), ((-3.0 * ((((data[""team1""] >= data[""team1LAverage""]).astype(float)) >= ((data[""team1Wstd""] > data[""team1WAverage""]).astype(float))).astype(float)))))) +
((0.094340 >= np.abs(np.cos((data[""team1Wmax""] - (((((data[""team1Lmedian""] >= ((data[""team1""] >= (-(np.ceil(data[""team2Lmax""])))).astype(float))).astype(float)) != np.ceil(np.ceil(data[""team2Wmin""]))).astype(float)) * data[""team1Lmedian""]))))).astype(float)) +
((np.abs(data[""team1Wmax""]) <= (data[""team1Seed""] - np.maximum( (np.abs(data[""year""])), (((((np.abs(data[""year""]) > np.maximum( ((data[""team1Wmin""] * 2.0)), (data[""team2wins""]))).astype(float)) < np.ceil(np.abs(data[""team2Wmin""]))).astype(float)))))).astype(float)) +
(((-2.0 < data[""team2""]).astype(float)) * np.abs((((data[""team2""] < data[""team1Seed""]).astype(float)) * (((data[""team2losses""] <= (-2.0 / 2.0)).astype(float)) + ((data[""team2Lstd""] * ((1.570796 < data[""year""]).astype(float))) * 2.0))))) +
np.minimum( ((((data[""team2Lstd""] * ((data[""team1Wmax""] > 1.0).astype(float))) + (-((((np.maximum( (data[""team1Lmin""]), (data[""team1Wmedian""])) <= data[""team2Lstd""]).astype(float)) / 2.0))))/2.0)), (((data[""team1LAverage""] < ((data[""team1LAverage""] >= 0.602941).astype(float))).astype(float)))) +
(-(((((data[""team1Lmin""] <= 0.840000).astype(float)) <= (-((data[""team2Lmedian""] * (data[""team2Wmax""] + (data[""team2Lmin""] + ((data[""team2Wstd""] < (data[""team2Lmedian""] * np.sinh(data[""team2Lmin""]))).astype(float)))))))).astype(float)))) +
((np.minimum( (data[""team2wins""]), (data[""team1Wstd""])) > ((1.197370 >= (np.minimum( ((data[""team1Wmin""] * data[""team1Wstd""])), (data[""team1Wstd""])) - np.minimum( (np.minimum( (data[""team2Lmax""]), (data[""team1Lmin""]))), (data[""team2Wmax""])))).astype(float))).astype(float)) +
(np.cos((-(data[""team2Wmin""]))) * (((data[""team2losses""] > data[""team1Lmedian""]).astype(float)) * np.sin((data[""team2Lstd""] * np.minimum( (np.sinh(data[""team1""])), ((8.0 - data[""team2losses""]))))))) +
(((-(data[""team1WAverage""])) >= (3.0 * np.maximum( (np.maximum( (data[""team1Lmax""]), (data[""team2Lstd""]))), (((data[""team2""] < ((1.0/(1.0 + np.exp(- np.tanh((1.0/(1.0 + np.exp(- ((data[""team2Wstd""] > data[""team1LAverage""]).astype(float))))))))) * 2.0)).astype(float)))))).astype(float)) +
np.sinh(np.sinh(((np.maximum( (data[""team2losses""]), ((data[""team2Wstd""] * data[""year""]))) > ((3.0 - np.cos(((((data[""team2Wstd""] > 2.409090).astype(float)) <= data[""team1Wmin""]).astype(float)))) - ((data[""team2Wstd""] > 2.409090).astype(float)))).astype(float)))) +
np.sinh((-(((((((-(((((data[""team1Seed""] < data[""team2Lstd""]).astype(float)) < data[""team2Wmin""]).astype(float)))) >= np.cos((-(data[""team2""])))).astype(float)) >= ((data[""team1Seed""] < (2.0 - data[""team1losses""])).astype(float))).astype(float)) / 2.0)))) +
(np.minimum( (((data[""team2WAverage""] > data[""team1Wmax""]).astype(float))), (((-(((data[""team1""] - (-(((data[""team1wins""] >= ((0.094340 > data[""team1Wmax""]).astype(float))).astype(float))))) - ((0.094340 > data[""team1Wmax""]).astype(float))))) / 2.0))) / 2.0) +
(((data[""team1""] >= (1.0/(1.0 + np.exp(- np.tanh(data[""team1Wstd""]))))).astype(float)) * np.tanh(((data[""team1Lmin""] > ((1.0/(1.0 + np.exp(- np.tanh(data[""team1Lmin""])))) * np.minimum( ((((1.0/(1.0 + np.exp(- data[""team1""]))) + data[""team1Wstd""])/2.0)), (data[""team2""])))).astype(float)))) +
(((((data[""team2Wmax""] * data[""team2Wmin""]) >= ((np.floor(data[""team2Wmin""]) == ((data[""team1WAverage""] <= data[""team1WAverage""]).astype(float))).astype(float))).astype(float)) + (-(np.round(np.sin((1.0/(1.0 + np.exp(- np.abs((data[""team1Wmin""] - np.cos(data[""team2Wmin""])))))))))))/2.0))

return Outputs(predictions)


def Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults):
winningteam1compactresults = pd.merge(how=\'left\',
left=teamcompactresults1,
right=regularseasoncompactresults,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Wteam\'])
winningteam1compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Lscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grpwinningteam1resultsaverage = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).mean()
winningteam1resultsaverage = grpwinningteam1resultsaverage.reset_index()
winningteam1resultsaverage.rename(columns={\'Wscore\': \'team1WAverage\'},
inplace=True)
grpwinningteam1resultsmin = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).min()
winningteam1resultsmin = grpwinningteam1resultsmin.reset_index()
winningteam1resultsmin.rename(columns={\'Wscore\': \'team1Wmin\'},
inplace=True)
grpwinningteam1resultsmax = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).max()
winningteam1resultsmax = grpwinningteam1resultsmax.reset_index()
winningteam1resultsmax.rename(columns={\'Wscore\': \'team1Wmax\'},
inplace=True)
grpwinningteam1resultsmedian = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).median()
winningteam1resultsmedian = grpwinningteam1resultsmedian.reset_index()
winningteam1resultsmedian.rename(columns={\'Wscore\': \'team1Wmedian\'},
inplace=True)
grpwinningteam1resultsstd = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).std()
winningteam1resultsstd = grpwinningteam1resultsstd.reset_index()
winningteam1resultsstd.rename(columns={\'Wscore\': \'team1Wstd\'},
inplace=True)
losingteam1compactresults = pd.merge(how=\'left\',
left=teamcompactresults1,
right=regularseasoncompactresults,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Lteam\'])
losingteam1compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Wscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grplosingteam1resultsaverage = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).mean()
losingteam1resultsaverage = grplosingteam1resultsaverage.reset_index()
losingteam1resultsaverage.rename(columns={\'Lscore\': \'team1LAverage\'},
inplace=True)
grplosingteam1resultsmin = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).min()
losingteam1resultsmin = grplosingteam1resultsmin.reset_index()
losingteam1resultsmin.rename(columns={\'Lscore\': \'team1Lmin\'},
inplace=True)
grplosingteam1resultsmax = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).max()
losingteam1resultsmax = grplosingteam1resultsmax.reset_index()
losingteam1resultsmax.rename(columns={\'Lscore\': \'team1Lmax\'},
inplace=True)
grplosingteam1resultsmedian = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).median()
losingteam1resultsmedian = grplosingteam1resultsmedian.reset_index()
losingteam1resultsmedian.rename(columns={\'Lscore\': \'team1Lmedian\'},
inplace=True)
grplosingteam1resultsstd = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).std()
losingteam1resultsstd = grplosingteam1resultsstd.reset_index()
losingteam1resultsstd.rename(columns={\'Lscore\': \'team1Lstd\'},
inplace=True)
winningteam2compactresults = pd.merge(how=\'left\',
left=teamcompactresults2,
right=regularseasoncompactresults,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Wteam\'])
winningteam2compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Lscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grpwinningteam2resultsaverage = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).mean()
winningteam2resultsaverage = grpwinningteam2resultsaverage.reset_index()
winningteam2resultsaverage.rename(columns={\'Wscore\': \'team2WAverage\'},
inplace=True)
grpwinningteam2resultsmin = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).min()
winningteam2resultsmin = grpwinningteam2resultsmin.reset_index()
winningteam2resultsmin.rename(columns={\'Wscore\': \'team2Wmin\'},
inplace=True)
grpwinningteam2resultsmax = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).max()
winningteam2resultsmax = grpwinningteam2resultsmax.reset_index()
winningteam2resultsmax.rename(columns={\'Wscore\': \'team2Wmax\'},
inplace=True)
grpwinningteam2resultsmedian = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).median()
winningteam2resultsmedian = grpwinningteam2resultsmedian.reset_index()
winningteam2resultsmedian.rename(columns={\'Wscore\': \'team2Wmedian\'},
inplace=True)
grpwinningteam2resultsstd = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).std()
winningteam2resultsstd = grpwinningteam2resultsstd.reset_index()
winningteam2resultsstd.rename(columns={\'Wscore\': \'team2Wstd\'},
inplace=True)
losingteam2compactresults = pd.merge(how=\'left\',
left=teamcompactresults2,
right=regularseasoncompactresults,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Lteam\'])
losingteam2compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Wscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grplosingteam2resultsaverage = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).mean()
losingteam2resultsaverage = grplosingteam2resultsaverage.reset_index()
losingteam2resultsaverage.rename(columns={\'Lscore\': \'team2LAverage\'},
inplace=True)
grplosingteam2resultsmin = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).min()
losingteam2resultsmin = grplosingteam2resultsmin.reset_index()
losingteam2resultsmin.rename(columns={\'Lscore\': \'team2Lmin\'},
inplace=True)
grplosingteam2resultsmax = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).max()
losingteam2resultsmax = grplosingteam2resultsmax.reset_index()
losingteam2resultsmax.rename(columns={\'Lscore\': \'team2Lmax\'},
inplace=True)
grplosingteam2resultsmedian = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).median()
losingteam2resultsmedian = grplosingteam2resultsmedian.reset_index()
losingteam2resultsmedian.rename(columns={\'Lscore\': \'team2Lmedian\'},
inplace=True)
grplosingteam2resultsstd = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).std()
losingteam2resultsstd = grplosingteam2resultsstd.reset_index()
losingteam2resultsstd.rename(columns={\'Lscore\': \'team2Lstd\'},
inplace=True)
agg_results = pd.merge(how=\'left\',
left=merged_results,
right=winningteam1resultsaverage,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsaverage,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmin,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmin,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmax,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmax,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmedian,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmedian,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsstd,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsstd,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsaverage,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsaverage,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmin,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmin,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmax,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmax,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmedian,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmedian,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsstd,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsstd,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
return agg_results


def GrabData():
tourneyresults = pd.read_csv(\'../input/TourneyCompactResults.csv\')
tourneyseeds = pd.read_csv(\'../input/TourneySeeds.csv\')
regularseasoncompactresults = \\
pd.read_csv(\'../input/RegularSeasonCompactResults.csv\')
sample = pd.read_csv(\'../input/SampleSubmission.csv\')
results = pd.DataFrame()
results[\'year\'] = tourneyresults.Season
results[\'team1\'] = np.minimum(tourneyresults.Wteam, tourneyresults.Lteam)
results[\'team2\'] = np.maximum(tourneyresults.Wteam, tourneyresults.Lteam)
results[\'result\'] = (tourneyresults.Wteam <
tourneyresults.Lteam).astype(int)
merged_results = pd.merge(left=results,
right=tourneyseeds,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team1Seed\'}, inplace=True)
merged_results = pd.merge(left=merged_results,
right=tourneyseeds,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team2Seed\'}, inplace=True)
merged_results[\'team1Seed\'] = \\
merged_results[\'team1Seed\'].apply(lambda x: str(x)[1:3])
merged_results[\'team2Seed\'] = \\
merged_results[\'team2Seed\'].apply(lambda x: str(x)[1:3])
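    # note: tournament seeds look like 'W01' or 'Y16a', so [1:3] keeps the two numeric digits
    # and drops the region letter (and any play-in suffix).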
merged_results = merged_results.astype(int)
winsbyyear = regularseasoncompactresults[[\'Season\', \'Wteam\']].copy()
winsbyyear[\'wins\'] = 1
wins = winsbyyear.groupby([\'Season\', \'Wteam\']).sum()
wins = wins.reset_index()
lossesbyyear = regularseasoncompactresults[[\'Season\', \'Lteam\']].copy()
lossesbyyear[\'losses\'] = 1
losses = lossesbyyear.groupby([\'Season\', \'Lteam\']).sum()
losses = losses.reset_index()
winsteam1 = wins.copy()
winsteam1.rename(columns={\'Season\': \'year\',
\'Wteam\': \'team1\',
\'wins\': \'team1wins\'}, inplace=True)
winsteam2 = wins.copy()
winsteam2.rename(columns={\'Season\': \'year\',
\'Wteam\': \'team2\',
\'wins\': \'team2wins\'}, inplace=True)
lossesteam1 = losses.copy()
lossesteam1.rename(columns={\'Season\': \'year\',
\'Lteam\': \'team1\',
\'losses\': \'team1losses\'}, inplace=True)
lossesteam2 = losses.copy()
lossesteam2.rename(columns={\'Season\': \'year\',
\'Lteam\': \'team2\',
\'losses\': \'team2losses\'}, inplace=True)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
teamcompactresults1 = merged_results[[\'year\', \'team1\']].copy()
teamcompactresults2 = merged_results[[\'year\', \'team2\']].copy()

train = Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults)

sample[\'year\'] = sample.Id.apply(lambda x: str(x)[:4]).astype(int)
sample[\'team1\'] = sample.Id.apply(lambda x: str(x)[5:9]).astype(int)
sample[\'team2\'] = sample.Id.apply(lambda x: str(x)[10:14]).astype(int)

merged_results = pd.merge(how=\'left\',
left=sample,
right=tourneyseeds,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team1Seed\'}, inplace=True)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=tourneyseeds,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team2Seed\'}, inplace=True)
merged_results[\'team1Seed\'] = \\
merged_results[\'team1Seed\'].apply(lambda x: str(x)[1:3]).astype(int)
merged_results[\'team2Seed\'] = \\
merged_results[\'team2Seed\'].apply(lambda x: str(x)[1:3]).astype(int)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])

teamcompactresults1 = merged_results[[\'year\', \'team1\']].copy()
teamcompactresults2 = merged_results[[\'year\', \'team2\']].copy()

test = Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults)

return train, test
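    # note: GrabData returns one row per tournament game (train) and per sample-submission
    # pair (test), carrying seeds, regular-season win/loss counts, and per-team summary
    # statistics (mean/min/max/median/std of winning and losing scores) joined on (year, team).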


if __name__ == ""__main__"":

train = pd.read_csv(\'../input/TourneyCompactResults.csv\')

elo = Elo(125)

team = {}

for index, row in train.iterrows():
t1 = row[\'Wteam\']
t2 = row[\'Lteam\']
if not t1 in team: team[t1] = 1000.0
if not t2 in team: team[t2] = 1000.0

(team[t1], team[t2]) = elo.rate_1vs1(team[t1], team[t2])
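        # note: rate_1vs1 treats its first argument as the winner, so passing the Wteam rating
        # first credits the win; ratings here start at 1000.0 rather than the module default INITIAL=1300.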
#print(team)

elo = Elo(140)

team2 = {}

for index, row in train.iterrows():
t1 = row[\'Wteam\']
t2 = row[\'Lteam\']
if not t1 in team2: team2[t1] = 1020.0
if not t2 in team2: team2[t2] = 1020.0

(team2[t1], team2[t2]) = elo.rate_1vs1(team2[t1], team2[t2])



train, test = GrabData()
trainlabels = train.result.values
train.drop(\'result\', inplace=True, axis=1)
train.fillna(-1, inplace=True)
testids = test.Id.values
test.drop([\'Id\', \'Pred\'], inplace=True, axis=1)
test.fillna(-1, inplace=True)
ss = StandardScaler()
train[train.columns] = np.round(ss.fit_transform(train), 6)
predictions = GPIndividual1(train)
predictions.fillna(1, inplace=True)
print(log_loss(trainlabels, np.clip(predictions.values, .01, .99)))
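    # note: this log loss is computed on the same games the formula was presumably derived
    # from, so it is an in-sample figure and will be optimistic compared with the leaderboard.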
test[test.columns] = np.round(ss.transform(test), 6)
predictions = GPIndividual1(test)
predictions.fillna(1, inplace=True)


preds = pd.read_csv(\'../input/SampleSubmission.csv\')
prediction = np.zeros((preds.shape[0], 1))
i = 0
for index, row in preds.iterrows():
p = list(map(int, str.split(str(row[\'Id\']), \'_\')))
#prediction[i] = 0.5 + 0.3*(team[p[1]] - team[p[2]]) / 480 + 0.7*(team2[p[1]] - team2[p[2]])/520
prediction[i] = predictions.values[i]
i += 1

preds[\'Pred\'] = np.clip(prediction, 0.07, 0.93)
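    # note: clipping to [0.07, 0.93] bounds the log-loss penalty; a confidently wrong
    # prediction at exactly 0 or 1 would otherwise be penalised without bound.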
preds.to_csv(\'Prediction.csv\', index=False)",No,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was replaced by model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "def loadData(df, test = None): dt = pd.to_datetime(df.datetime).dt df[""Year""] = dt.year df[""Month""] = dt.month df[""Day""] = dt.day df[""Hour""] = dt.hour df.drop(""datetime"", axis = 1, inplace = True) if not test: df.drop(""casual"", axis = 1, inplace = True) df.drop(""registered"", axis = 1, inplace = True) if test: y = None else: y = df[""count""] df.drop(""count"", axis = 1, inplace = True) X = df return X, y ",No,4,21.0 "X, y = loadData(train)",No,5,21.0 new_y = np.log(y + 1),No,4,21.0 "# use a full grid over all parameters
\'\'\'
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from time import time
param_grid = {""max_depth"": [3, None],
""max_features"": [1, 3, 10],
""min_samples_split"": [1, 3, 10],
""min_samples_leaf"": [1, 3, 10],
""bootstrap"": [True, False]}

clf = RandomForestRegressor(n_estimators=20)
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print(""GridSearchCV took %.2f seconds for %d candidate parameter settings.""
% (time() - start, len(grid_search.grid_scores_)))
#report(grid_search.grid_scores_)
\'\'\'",No,5,6.0 "# RF X_test, _ = loadData(test, test = True) rf = RandomForestRegressor().fit(X, new_y) prediction = rf.predict(X_test) ",No,4,48.0 "### xgb #import xgboost as xgb #gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y) #predictions = gbm.predict(test_X)",No,4,48.0 "### Get submission sample = pd.read_csv(""../input/sampleSubmission.csv"") submission = pd.DataFrame() submission[""datetime""] = sample[""datetime""] submission[""count""] = pd.Series(prediction) submission.to_csv(""sub.csv"", index = False)",No,3,55.0 "print(check_output([""head"", ""../input/sampleSubmission.csv""]).decode(""utf8""))",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import random def random_sampler(filename, k): sample = [] with open(filename, 'rb') as f: f.seek(0, 2) filesize = f.tell() random_set = sorted(random.sample(range(filesize), k)) for i in range(k): f.seek(random_set[i]) # Skip current line (because we might be in the middle of a line) f.readline() # Append the next line to the sample set sample.append(f.readline().rstrip()) return sample",No,5,15.0 "TRAIN_SAMPLES = 5*10**6 train_sample = random_sampler('../input/train.csv', TRAIN_SAMPLES)",No,5,15.0 "train_sample[0].decode().split(',')",No,4,78.0 "train_sample_ = [row.decode().split("","") for row in train_sample]",No,5,78.0 train = pd.DataFrame(train_sample_),No,5,12.0 "train_df = pd.read_csv('../input/train.csv', nrows=1) train.columns = train_df.columns",No,3,45.0 train = train.apply(pd.to_numeric),No,5,16.0 train.info(),No,5,40.0 train['Demanda_uni_equil'] = np.log1p(train['Demanda_uni_equil']),No,4,21.0 "x_cols = train.columns x_cols = x_cols.drop(['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil']) print(x_cols)",No,5,10.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor()",No,5,4.0 "model.fit(train[x_cols], train['Demanda_uni_equil'])",No,5,7.0 "test = pd.read_csv('../input/test.csv') test.info()",No,4,45.0 test['Demanda_uni_equil'] = np.expm1(model.predict(test[x_cols])),No,5,48.0 "test[['id', 'Demanda_uni_equil']].to_csv('predictions_rf_random_sampling.csv', index=False)",No,5,25.0 "import pandas as pd
import numpy as np
from scipy import sparse as ssp
import pylab as plt
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD,NMF,PCA,FactorAnalysis
from sklearn.feature_selection import SelectFromModel,SelectPercentile,f_classif
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,roc_auc_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.cross_validation import StratifiedKFold,KFold
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint,Callback
from keras import backend as K
from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda,AveragePooling1D,Reshape
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU
from keras.models import Model

seed = 1
np.random.seed(seed)
dim = 32
hidden=64

path = ""../input/""

class AucCallback(Callback):  # inherits from Callback

    def __init__(self, validation_data=(), patience=25, is_regression=True, best_model_name='best_keras.mdl', feval='roc_auc_score', batch_size=1024*8):
        super(Callback, self).__init__()

        self.patience = patience
        self.X_val, self.y_val = validation_data  # tuple of validation X and y
        self.best = -np.inf
        self.wait = 0  # counter for patience
        self.best_model = None
        self.best_model_name = best_model_name
        self.is_regression = is_regression
        self.y_val = self.y_val  # .astype(np.int)
        self.feval = feval
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs={}):
        p = self.model.predict(self.X_val, batch_size=self.batch_size, verbose=0)  # .ravel()
        if self.feval == 'roc_auc_score':
            current = roc_auc_score(self.y_val, p)

        if current > self.best:
            self.best = current
            self.wait = 0
            self.model.save_weights(self.best_model_name, overwrite=True)
        else:
            if self.wait >= self.patience:
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (epoch))
            self.wait += 1  # increment the number of epochs without improvement

        print('Epoch %d Auc: %f | Best Auc: %f \n' % (epoch, current, self.best))


def make_batches(size, batch_size):
    nb_batch = int(np.ceil(size/float(batch_size)))
    return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]
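# Added sanity check (not in the original script): make_batches splits `size`
# rows into consecutive (start, end) index pairs, e.g. for 10 rows and a batch
# size of 4 it yields three slices covering every row exactly once.
assert make_batches(10, 4) == [(0, 4), (4, 8), (8, 10)]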



def main():
train = pd.read_csv(path+\'act_train.csv\')
test = pd.read_csv(path+\'act_test.csv\')
people = pd.read_csv(path+\'people.csv\')
columns = people.columns
test[\'outcome\'] = np.nan
data = pd.concat([train,test])

data = pd.merge(data,people,how=\'left\',on=\'people_id\').fillna(\'missing\')
train = data[:train.shape[0]]
test = data[train.shape[0]:]



columns = train.columns.tolist()
columns.remove(\'activity_id\')
columns.remove(\'outcome\')
data = pd.concat([train,test])
for c in columns:
data[c] = LabelEncoder().fit_transform(data[c].values)

train = data[:train.shape[0]]
test = data[train.shape[0]:]

data = pd.concat([train,test])
columns = train.columns.tolist()
columns.remove(\'activity_id\')
columns.remove(\'outcome\')
flatten_layers = []
inputs = []
count=0
for c in columns:

inputs_c = Input(shape=(1,), dtype=\'int32\')

num_c = len(np.unique(data[c].values))

embed_c = Embedding(
num_c,
dim,
dropout=0.2,
input_length=1
)(inputs_c)
flatten_c= Flatten()(embed_c)

inputs.append(inputs_c)
flatten_layers.append(flatten_c)
count+=1

flatten = merge(flatten_layers,mode=\'concat\')
reshaped_flatten = Reshape((count,dim))(flatten)

conv_1 = Convolution1D(nb_filter=16,
filter_length=3,
border_mode=\'same\',
activation=\'relu\',
subsample_length=1)(reshaped_flatten)
pool_1 = MaxPooling1D(pool_length=int(count/2))(conv_1)

flatten = Flatten()(pool_1)


fc1 = Dense(hidden,activation=\'relu\')(flatten)
dp1 = Dropout(0.5)(fc1)

outputs = Dense(1,activation=\'sigmoid\')(dp1)

model = Model(input=inputs, output=outputs)
model.compile(
optimizer=\'adam\',
loss=\'binary_crossentropy\',
)

del data

X = train[columns].values
X_t = test[columns].values
y = train[""outcome""].values
people_id = train[""people_id""].values
activity_id = test[\'activity_id\']
del train
del test

skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=seed)
for ind_tr, ind_te in skf:
X_train = X[ind_tr]
X_test = X[ind_te]

y_train = y[ind_tr]
y_test = y[ind_te]
break

X_train = [X_train[:,i] for i in range(X.shape[1])]
X_test = [X_test[:,i] for i in range(X.shape[1])]

del X

model_name = \'mlp_residual_%s_%s.hdf5\'%(dim,hidden)
model_checkpoint = ModelCheckpoint(model_name, monitor=\'val_loss\', save_best_only=True)
auc_callback = AucCallback(validation_data=(X_test,y_test), patience=5,is_regression=True,best_model_name=path+\'best_keras.mdl\',feval=\'roc_auc_score\')

nb_epoch = 2

batch_size = 1024*8
load_model = False

if load_model:
print(\'Load Model\')
model.load_weights(path+model_name)
# model.load_weights(path+\'best_keras.mdl\')

model.fit(
X_train,
y_train,
batch_size=batch_size,
nb_epoch=nb_epoch,
verbose=1,
shuffle=True,
validation_data=[X_test,y_test],
# callbacks = [
# model_checkpoint,
# auc_callback,
# ],
)

# model.load_weights(model_name)
# model.load_weights(path+\'best_keras.mdl\')

y_preds = model.predict(X_test,batch_size=1024*8)
# print(\'auc\',roc_auc_score(y_test,y_preds))

# print(\'Make submission\')
X_t = [X_t[:,i] for i in range(X_t.shape[1])]
outcome = model.predict(X_t,batch_size=1024*8)
submission = pd.DataFrame()
submission[\'activity_id\'] = activity_id
submission[\'outcome\'] = outcome
submission.to_csv(\'submission_residual_%s_%s.csv\'%(dim,hidden),index=False)

main()
",No,3,45.0 "%matplotlib inline import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import matplotlib.cm as cm import os from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import KFold from sklearn.metrics import log_loss",No,5,23.0 "gatrain = pd.read_csv('../input/gender_age_train.csv') gatest = pd.read_csv('../input/gender_age_test.csv') gatrain.head(3)",No,4,45.0 "letarget = LabelEncoder().fit(gatrain.group.values) y = letarget.transform(gatrain.group.values) n_classes = len(letarget.classes_)",No,4,20.0 "phone = pd.read_csv('../input/phone_brand_device_model.csv',encoding='utf-8') phone.head(3)",No,4,45.0 "phone = phone.drop_duplicates('device_id', keep='first')",No,5,19.0 "lebrand = LabelEncoder().fit(phone.phone_brand) phone['brand'] = lebrand.transform(phone.phone_brand) m = phone.phone_brand.str.cat(phone.device_model) lemodel = LabelEncoder().fit(m) phone['model'] = lemodel.transform(m)",No,5,20.0 "train = gatrain.merge(phone[['device_id','brand','model']], how='left',on='device_id')",No,5,32.0 "class GenderAgeGroupProb(object): def __init__(self): pass def fit(self, df, by, n_smoothing, weights): self.by = by self.n_smoothing = n_smoothing self.weights = np.divide(weights,sum(weights)) self.classes_ = sorted(df['group'].unique()) self.n_classes_ = len(self.classes_) self.group_freq = df['group'].value_counts().sort_index()/df.shape[0] self.prob_by = [] for i,b in enumerate(self.by): c = df.groupby([b,'group']).size().unstack().fillna(0) total = c.sum(axis=1) prob = (c.add(self.n_smoothing[i]*self.group_freq)).div(total+self.n_smoothing[i], axis=0) self.prob_by.append(prob) return self def predict_proba(self, df): pred = pd.DataFrame(np.zeros((len(df.index),self.n_classes_)),columns=self.classes_,index=df.index) pred_by = [] for i,b in enumerate(self.by): pred_by.append(df[[b]].merge(self.prob_by[i], how='left', left_on=b, right_index=True).fillna(self.group_freq)[self.classes_]) pred = pred.radd(pred_by[i].values*self.weights[i]) pred.loc[pred.iloc[:,0].isnull(),:] = self.group_freq return pred[self.classes_].values def score(ptrain, by, n_smoothing, weights=[0.5,0.5]): kf = KFold(ptrain.shape[0], n_folds=10, shuffle=True, random_state=0) pred = np.zeros((ptrain.shape[0],n_classes)) for itrain, itest in kf: train = ptrain.iloc[itrain,:] test = ptrain.iloc[itest,:] ytrain, ytest = y[itrain], y[itest] clf = GenderAgeGroupProb().fit(train,by,n_smoothing,weights) pred[itest,:] = clf.predict_proba(test) return log_loss(y, pred)",No,4,7.0 "n_smoothing = [1,5,10,15,20,50,100] res = [score(train,['brand','model'],[s,s],[.5,.5]) for s in n_smoothing] plt.plot(n_smoothing, res) plt.title('Best score {:.5f} at n_smoothing = {}'.format(np.min(res),n_smoothing[np.argmin(res)])) plt.xlabel('n_smoothing')",No,4,81.0 "brand_weight = [0,0.2,0.4,0.6,0.8,1.0] res = [score(train,['brand','model'],[15,15],[b,1-b]) for b in brand_weight] plt.plot(brand_weight, res) plt.title('Best score {:.5f} at brand_weight = {}'.format(np.min(res),brand_weight[np.argmin(res)])) plt.xlabel('brand_weight')",No,5,81.0 "test = gatest.merge(phone[['device_id','brand','model']], how='left',on='device_id') test.head(3)",No,5,32.0 "clf = GenderAgeGroupProb().fit(train,['brand','model'],[15,15],[0.4,0.6]) pred = clf.predict_proba(test)",No,4,7.0 "pd.DataFrame(pred, index = test.device_id, columns=clf.classes_).to_csv('pbm_subm.csv', index=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#from subprocess import check_output
#print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",Yes,3,10.0 "def getVariables(value=1000): for var, obj in globals().items(): try: if(sys.getsizeof(obj) > value and not var.startswith(""_"")): print (""{0:30} {1:5}"".format(var, sys.getsizeof(obj))) except: continue",No,3,23.0 "def evalerror(preds, dtrain): labels = dtrain.get_label() assert len(preds) == len(labels) labels = labels.tolist() preds = preds.tolist() terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1)) ** 2.0 for i,pred in enumerate(labels)] return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5",No,5,84.0 "print ('Loading Test...') dtype_test = {'id':np.uint32, 'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8, 'Ruta_SAK': np.uint16, 'Cliente_ID': np.uint32, 'Producto_ID': np.uint16} %time test = pd.read_csv('../input/test.csv', usecols=dtype_test.keys(), dtype=dtype_test) test.head()",No,5,45.0 test.shape,No,5,58.0 "dtype = {'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8, 'Ruta_SAK': np.uint16, 'Cliente_ID': np.uint32, 'Producto_ID': np.uint16, 'Demanda_uni_equil': np.uint16} filename='../input/train.csv' %time train = pd.read_csv(filename, usecols=dtype.keys(), dtype=dtype, warn_bad_lines= True,engine='c') train.head()",No,4,45.0 "train = train[train[""Semana""]>8]
print (\'Training_Shape:\', train.shape)",No,4,14.0 "ids = test['id'] test = test.drop(['id'],axis = 1) y = train['Demanda_uni_equil'] X = train[test.columns.values] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729) del(train) print ('Division_Set_Shapes:', X.shape, y.shape) print ('Validation_Set_Shapes:', X_train.shape, X_test.shape) del(X) del(y)",No,3,21.0 "params = {}
params[\'objective\'] = ""reg:linear""
params[\'eta\'] = 0.1
params[\'max_depth\'] = 5
params[\'subsample\'] = 0.8
params[\'colsample_bytree\'] = 0.6
params[\'silent\'] = True
#params[\'nthread\']= 4
params[\'booster\'] = ""gbtree""


test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
del(X_train)
del(y_train)
xg_test = xgb.DMatrix(X_test)
del(X_test)
watchlist = [(xg_train, \'train\')]",No,4,59.0 "num_rounds = 20 %time xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 20, verbose_eval = 10) del(xg_train)",No,5,7.0 "preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration) print ('RMSLE Score:', rmsle(y_test, preds)) ",No,3,49.0 "print ('RMSLE Score:', rmsle(y_test, preds)) del(preds) del(y_test)",No,5,49.0 "import numpy as np import pandas as pd import datetime act_train = pd.read_csv('../input/act_train.csv') act_test = pd.read_csv('../input/act_test.csv') people = pd.read_csv('../input/people.csv') people.sample(10)",Yes,4,8.0 "def process_dates(data,min_date): #min_date=data.min() min_date data=data.apply(lambda x: (datetime.datetime.strptime(x,""%Y-%m-%d"") -datetime.datetime.strptime(min_date,""%Y-%m-%d"")).days) data return data",No,4,8.0 "import pandas as pd
import numpy as np
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
from operator import itemgetter
import time
import copy

random.seed(2016)


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def intersect(a, b):
    return list(set(a) & set(b))

def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('people_id')
    return sorted(output)
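# Illustration (added, not part of the original script): get_features keeps only
# the columns shared by train and test, so the train-only 'outcome' column can
# never leak into the feature list. For example:
#   get_features(pd.DataFrame(columns=['people_id', 'char_1', 'outcome']),
#                pd.DataFrame(columns=['people_id', 'char_1']))  ->  ['char_1']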

def run_single(train, test, features, target, random_state=0):
    eta = 1.3
    max_depth = 3
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        ""objective"": ""binary:logistic"",
        ""booster"": ""gbtree"",
        ""eval_metric"": ""auc"",
        ""eta"": eta,
        ""tree_method"": 'exact',
        ""max_depth"": max_depth,
        ""subsample"": subsample,
        ""colsample_bytree"": colsample_bytree,
        ""silent"": 1,
        ""seed"": random_state,
    }
    num_boost_round = 115
    early_stopping_rounds = 10
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print(""Validating..."")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration+1)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print(""Predict test set..."")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration+1)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist()


def simple_load():

print(""Read people.csv..."")
people = pd.read_csv(""../input/people.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'char_38\': np.int32},
parse_dates=[\'date\'])

print(""Load train.csv..."")
train = pd.read_csv(""../input/act_train.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'outcome\': np.int8},
parse_dates=[\'date\'])

print(""Load test.csv..."")
test = pd.read_csv(""../input/act_test.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str},
parse_dates=[\'date\'])

print(""Process tables..."")
for table in [train, test]:
table[\'activity_category\'] = table[\'activity_category\'].str.lstrip(\'type \').astype(np.int32)
for i in range(1, 11):
table[\'char_\' + str(i)].fillna(\'type -999\', inplace=True)
table[\'char_\' + str(i)] = table[\'char_\' + str(i)].str.lstrip(\'type \').astype(np.int32)
people[\'year\'] = people[\'date\'].dt.year
people[\'month\'] = people[\'date\'].dt.month
people[\'day\'] = people[\'date\'].dt.day
people[\'weekday\'] = people[\'date\'].dt.weekday
people[\'weekend\'] = ((people.weekday == 0) | (people.weekday == 6)).astype(int)
people.drop(\'date\', axis=1, inplace=True)
people[\'group_1\'] = people[\'group_1\'].str.lstrip(\'group \').astype(np.int32)
for i in range(1, 10):
people[\'char_\' + str(i)] = people[\'char_\' + str(i)].str.lstrip(\'type \').astype(np.int32)
for i in range(10, 38):
people[\'char_\' + str(i)] = people[\'char_\' + str(i)].astype(np.int32)

print(""Merge..."")
train = train.merge(people, on=""people_id"", suffixes=(""_act"", """"))
test = test.merge(people, on=""people_id"", suffixes=(""_act"", """"))

# Set index to activity id
train = train.set_index(""activity_id"")
test = test.set_index(""activity_id"")
return train, test


def group_decision(train, test, only_certain=True):
# Exploit the leak revealed by Loiso and team to try and directly infer any labels that can be inferred
# https://www.kaggle.com/c/predicting-red-hat-business-value/forums/t/22807/0-987-kernel-now-available-seems-like-leakage

# Make a lookup dataframe, and copy those in first since we can be sure of them
lookup = train.groupby([""group_1"", ""date_act""], as_index=False)[""outcome""].mean()
test = pd.merge(test.reset_index(), lookup, how=""left"", on=[""group_1"", ""date_act""]).set_index(""activity_id"")

# Create some date filling columns that we\'ll use after we append
train[""date_act_fillfw""] = train[""date_act""]
train[""date_act_fillbw""] = train[""date_act""]

# Create some group filling columns for later use
train[""group_fillfw""] = train[""group_1""]
train[""group_fillbw""] = train[""group_1""]

# Put the two data sets together and sort
df = train.append(test)
df = df.sort_values(by=[""group_1"", ""date_act""])

# Fill the dates
df[""date_act_fillfw""] = df[""date_act_fillfw""].fillna(method=""ffill"")
df[""date_act_fillbw""] = df[""date_act_fillbw""].fillna(method=""bfill"")

# Fill labels
df[""outcome_fillfw""] = df[""outcome""].fillna(method=""ffill"")
df[""outcome_fillbw""] = df[""outcome""].fillna(method=""bfill"")

# Fill the groups
df[""group_fillfw""] = df[""group_fillfw""].fillna(method=""ffill"")
df[""group_fillbw""] = df[""group_fillbw""].fillna(method=""bfill"")

# Create int booleans for whether the fillers are from the same date
df[""fw_same_date""] = (df[""date_act_fillfw""] == df[""date_act""]).astype(int)
df[""bw_same_date""] = (df[""date_act_fillbw""] == df[""date_act""]).astype(int)

# Create int booleans for whether the fillers are in the same group
df[""fw_same_group""] = (df[""group_fillfw""] == df[""group_1""]).astype(int)
df[""bw_same_group""] = (df[""group_fillbw""] == df[""group_1""]).astype(int)

# Use the filled labels only if the labels were from the same group, unless we\'re at the end of the group
df[""interfill""] = (df[""outcome_fillfw""] *
df[""fw_same_group""] +
df[""outcome_fillbw""] *
df[""bw_same_group""]) / (df[""fw_same_group""] +
df[""bw_same_group""])

# If the labels are at the end of the group, cushion by 0.5
df[""needs cushion""] = (df[""fw_same_group""] * df[""bw_same_group""] - 1).abs()
df[""cushion""] = df[""needs cushion""] * df[""interfill""] * -0.1 + df[""needs cushion""] * 0.05
df[""interfill""] = df[""interfill""] + df[""cushion""]

# Fill everything
df[""outcome""] = df[""outcome""].fillna(df[""interfill""])

if only_certain == True:
# Drop anything we\'re not 100% certain of
df = df[(df[""outcome""] == 0.0) | (df[""outcome""] == 1.0)]

# Return outcomes to the original index
test[""outcome""] = df[""outcome""]

return test[""outcome""]

def xgboost_return(train,test,features):
    print(""Process tables... "")
    for table in [train, test]:
        table['year'] = table['date'].dt.year
        table['month'] = table['date'].dt.month
        table['day'] = table['date'].dt.day
        table['weekday'] = table['date'].dt.weekday
        table['weekend'] = ((table.weekday == 0) | (table.weekday == 6)).astype(int)
        table.drop('date', axis=1, inplace=True)
    features.remove('date')
    features.remove('date_act')
    test[""extra outcomes""] = run_single(train,test,features,""outcome"")
    return test[""extra outcomes""]

def model():

    # Load in the data set simply by merging together
    train, test = simple_load()

    # Get features
    features = get_features(train,test)

    # Try to just infer the correct dates using the data leak
    test[""outcome""] = group_decision(train, test, only_certain=False)

    # Write the inferred predictions to a template
    test.reset_index()[[""activity_id"", ""outcome""]].to_csv(""starter_template.csv"", index=False)

    # Fill any rows that could not be inferred with the XGBoost predictions
    test[""outcome""] = test[""outcome""].fillna(xgboost_return(train,test,features))

    return test.reset_index()[[""activity_id"", ""outcome""]]


def main():

# Write a benchmark file to the submissions folder
model().to_csv(""submission.csv"", index=False)

if __name__ == ""__main__"":
main()",No,5,53.0 sub_df = pd.read_csv('../input/sample_submission.csv'),No,5,45.0 "# Size of the dataframe print(dataset.shape) # We can see that there are 15120 instances having 55 attributes #Learning : Data is loaded successfully as dimensions match the data description",No,5,58.0 "# Datatypes of the attributes print(dataset.dtypes) # Learning : Data types of all attributes has been inferred as int64",No,5,70.0 sub_df.head(),No,5,41.0 "# Statistical description pandas.set_option('display.max_columns', None) print(dataset.describe()) # Learning : # No attribute is missing as count is 15120 for all attributes. Hence, all rows can be used # Negative value(s) present in Vertical_Distance_To_Hydrology. Hence, some tests such as chi-sq cant be used. # Wilderness_Area and Soil_Type are one hot encoded. Hence, they could be converted back for some analysis # Attributes Soil_Type7 and Soil_Type15 can be removed as they are constant # Scales are not the same for all. Hence, rescaling and standardization may be necessary for some algos",No,3,23.0 train_df = pd.read_csv('../input/act_train.csv'),No,5,45.0 "train_df['activity_category'] = train_df['activity_category'].astype('category').cat.codes columns = ['char_'+str(i) for i in range(1,11)] train_df[columns] = train_df[columns].apply(lambda x: x.astype('category').cat.codes) train_df['date'] = pd.to_datetime(train_df['date']) train_df['day'] = train_df['date'].apply(lambda x:x.day) train_df['year'] = train_df['date'].apply(lambda x:x.year) train_df['month'] = train_df['date'].apply(lambda x:x.month) train_df = train_df.drop(['date'],axis = 1)",No,3,8.0 "# Skewness of the distribution print(dataset.skew()) # Values close to 0 show less skew # Several attributes in Soil_Type show a large skew. Hence, some algos may benefit if skew is corrected",No,4,40.0 "# Number of instances belonging to each class dataset.groupby('Cover_Type').size() # We see that all classes have an equal presence. No class re-balancing is necessary",No,5,60.0 "import numpy

# Correlation tells relation between two attributes.
# Correlation requires continous data. Hence, ignore Wilderness_Area and Soil_Type as they are binary

#sets the number of features considered
size = 10

#create a dataframe with only \'size\' features
data=dataset.iloc[:,:size]

#get the names of all the columns
cols=data.columns

# Calculates pearson co-efficient for all combinations
data_corr = data.corr()

# Set the threshold to select only only highly correlated attributes
threshold = 0.5

# List of pairs along with correlation above threshold
corr_list = []

#Search for the highly correlated pairs
for i in range(0,size):  # for 'size' features
    for j in range(i+1,size):  # avoid repetition
        if (data_corr.iloc[i,j] >= threshold and data_corr.iloc[i,j] < 1) or (data_corr.iloc[i,j] < 0 and data_corr.iloc[i,j] <= -threshold):
            corr_list.append([data_corr.iloc[i,j],i,j])  # store correlation and columns index

#Sort to show higher ones first
s_corr_list = sorted(corr_list,key=lambda x: -abs(x[0]))

#Print correlations and column names
for v,i,j in s_corr_list:
    print (""%s and %s = %.2f"" % (cols[i],cols[j],v))

# Strong correlation is observed between the following pairs
# This represents an opportunity to reduce the feature set through transformations such as PCA",No,5,53.0 "#import plotting libraries import seaborn as sns import matplotlib.pyplot as plt # Scatter plot of only the highly correlated pairs for v,i,j in s_corr_list: sns.pairplot(dataset, hue=""Cover_Type"", size=6, x_vars=cols[i],y_vars=cols[j] ) plt.show() #The plots show to which class does a point belong to. The class distribution overlaps in the plots. #Hillshade patterns give a nice ellipsoid patterns with each other #Aspect and Hillshades attributes form a sigmoid pattern #Horizontal and vertical distance to hydrology give an almost linear pattern.",No,5,33.0 "# We will visualize all the attributes using Violin Plot - a combination of box and density plots #names of all the attributes cols = dataset.columns #number of attributes (exclude target) size = len(cols)-1 #x-axis has target attribute to distinguish between classes x = cols[size] #y-axis shows values of an attribute y = cols[0:size] #Plot violin for all attributes for i in range(0,size): sns.violinplot(data=dataset,x=x,y=y[i]) plt.show() #Elevation is has a separate distribution for most classes. Highly correlated with the target and hence an important attribute #Aspect contains a couple of normal distribution for several classes #Horizontal distance to road and hydrology have similar distribution #Hillshade 9am and 12pm display left skew #Hillshade 3pm is normal #Lots of 0s in vertical distance to hydrology #Wilderness_Area3 gives no class distinction. As values are not present, others gives some scope to distinguish #Soil_Type, 1,5,8,9,12,14,18-22, 25-30 and 35-40 offer class distinction as values are not present for many classes",No,5,33.0 "# Group one-hot encoded variables of a category into one single variable

#names of all the columns
cols = dataset.columns

#number of rows=r , number of columns=c
r,c = dataset.shape

#Create a new dataframe with r rows, one column for each encoded category, and target in the end
data = pandas.DataFrame(index=numpy.arange(0, r),columns=[\'Wilderness_Area\',\'Soil_Type\',\'Cover_Type\'])

#Make an entry in \'data\' for each r as category_id, target value
for i in range(0,r):
    w=0;
    s=0;
    # Category1 range
    for j in range(10,14):
        if (dataset.iloc[i,j] == 1):
            w=j-9  # category class
            break
    # Category2 range
    for k in range(14,54):
        if (dataset.iloc[i,k] == 1):
            s=k-13  # category class
            break
    # Make an entry in 'data' for each r as category_id, target value
    data.iloc[i]=[w,s,dataset.iloc[i,c-1]]
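# Vectorized alternative (added sketch, not in the original): assuming each row
# has exactly one 1 in each one-hot group, the same ids can be derived without
# the explicit row loop:
#   data['Wilderness_Area'] = dataset.iloc[:, 10:14].values.argmax(axis=1) + 1
#   data['Soil_Type'] = dataset.iloc[:, 14:54].values.argmax(axis=1) + 1
#   data['Cover_Type'] = dataset.iloc[:, c-1].values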

#Plot for Category1
sns.countplot(x=""Wilderness_Area"", hue=""Cover_Type"", data=data)
plt.show()
#Plot for Category2
plt.rc(""figure"", figsize=(25, 10))
sns.countplot(x=""Soil_Type"", hue=""Cover_Type"", data=data)
plt.show()

#(right-click and open the image in a new window for larger size)
#WildernessArea_4 has a lot of presence for cover_type 4. Good class distinction
#WildernessArea_3 has not much class distinction
#SoilType 1-6,10-14,17, 22-23, 29-33,35,38-40 offer lot of class distinction as counts for some are very high",No,3,20.0 "#Removal list initialize rem = [] #Add constant columns as they don't help in prediction process for c in dataset.columns: if dataset[c].std() == 0: #standard deviation is zero rem.append(c) #drop the columns dataset.drop(rem,axis=1,inplace=True) print(rem) #Following columns are dropped",No,5,10.0 train_df.corr().outcome,No,5,40.0 "rank_df = pandas.DataFrame(data=[x[7] for x in X_all_add],columns=cols[:c-1]) _ = rank_df.boxplot(rot=90) #Below plot summarizes the rankings according to the standard feature selection techniques #Top ranked attributes are ... first 10 attributes, Wilderness_Area1,4 ...Soil_Type 3,4,10,38-40",No,3,12.0 "rank_df = pandas.DataFrame(data=[x[7] for x in X_all_add],columns=cols[:c-1])
med = rank_df.median()
print(med)
#Write medians to output file for exploratory study on ML algorithms
with open(""median.csv"", ""w"") as subfile:
subfile.write(""Column,Median\
"")
subfile.write(med.to_string())",No,4,40.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.

print (\'Loading input files..\')
print ()
people = pd.read_csv(\'../input/people.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'char_38\': np.int32},
parse_dates=[\'date\'])
train = pd.read_csv(r\'../input/act_train.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'outcome\': np.int8},
parse_dates=[\'date\'])
test = pd.read_csv(\'../input/act_test.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str},
parse_dates=[\'date\'])

missing_values = []

print (\'Train set features\')
print (\'------------------\')
for col in train:
    unique = train[col].unique()
    print (str(col) + ' has ' + str(unique.size) + ' unique values')

    if (True in pd.isnull(unique)):
        print (str(col) + ' has ' + str(pd.isnull(train[col]).sum()) + ' missing values')
print ()

print ()

print (\'Processing the datasets..\')
print ()
for data in [train,test]:
    for i in range(1,11):
        data['char_'+str(i)].fillna('type -1', inplace=True)
        data['char_'+str(i)] = data['char_'+str(i)].str.lstrip('type ').astype(np.int32)

    data['activity_category'] = data['activity_category'].str.lstrip('type ').astype(np.int32)

    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data.drop('date', axis=1, inplace=True)

for i in range(1,10):
    people['char_' + str(i)] = people['char_' + str(i)].str.lstrip('type ').astype(np.int32)
for i in range(10, 38):
    people['char_' + str(i)] = people['char_' + str(i)].astype(np.int32)
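# Note (added, not in the original): str.lstrip('type ') removes a *set* of
# leading characters rather than the literal prefix; it works here because the
# values look like 'type 12' ('type 7' -> '7'), but str.replace('type ', '')
# would be the safer idiom.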

people[\'group_1\'] = people[\'group_1\'].str.lstrip(\'group \').astype(np.int32)
people[\'year\'] = people[\'date\'].dt.year
people[\'month\'] = people[\'date\'].dt.month
people[\'day\'] = people[\'date\'].dt.day
people.drop(\'date\', axis=1, inplace=True)

print (\'Merging the datasets..\')
print ()

train = pd.merge(train, people, how=\'left\', on=\'people_id\', left_index=True)
train.fillna(-1, inplace=True)
test = pd.merge(test, people, how=\'left\', on=\'people_id\', left_index=True)
test.fillna(-1, inplace=True)

train = train.drop([\'people_id\'], axis=1)

#Separate label and data
Y = train[\'outcome\']
X = train.drop([\'outcome\'], axis=1)
X = X.iloc[:,1:]
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=96)

#print(""cv"")
#scores = cross_val_score(rfc, X, Y, cv=4)
#print (""Mean accuracy of Random Forest: "" + scores.mean())
rfc = rfc.fit(X, Y)
#drop the people_id
test = test.drop([\'people_id\'], axis=1)
# Get the test data features, skipping the first column \'PassengerId\'
test_x = test.iloc[:, 1:]


# Predict the outcome values for the test data
test_y = list(map(int, rfc.predict(test_x)))
#file for submission
test[\'outcome\'] = test_y
test[[\'activity_id\', \'outcome\']] \\
.to_csv(\'results.csv\', index=False)",No,5,53.0 "import pandas as pd import numpy as np %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,23.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))",No,4,45.0 "brandencoder = LabelEncoder().fit(phone.phone_brand) phone['brand'] = brandencoder.transform(phone['phone_brand']) gatrain['brand'] = phone['brand'] gatest['brand'] = phone['brand'] Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.brand))) Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand))) print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))",No,4,20.0 "people_df = pd.read_csv('../input/people.csv') people_df.group_1.unique().shape",No,3,45.0 "people_df = pd.read_csv('../input/people.csv') columns = ['char_'+str(i) for i in range(1,10)] people_df[columns] = people_df[columns].apply(lambda x: x.astype('category').cat.codes) people_df['group_1'] = people_df['group_1'].astype('category').cat.codes people_df['date'] = pd.to_datetime(people_df['date']) people_df['day'] = people_df['date'].apply(lambda x:x.day) people_df['year'] = people_df['date'].apply(lambda x:x.year) people_df['month'] = people_df['date'].apply(lambda x:x.month) people_df = people_df.drop(['date'],axis = 1) people_df = people_df.set_index(people_df['people_id']) people_df.head()",No,4,8.0 "train_X = train_df.join(people_df,on = 'people_id', rsuffix='_people')",No,5,32.0 "Y = train_X['outcome'] X = train_X.drop(['outcome','people_id','people_id_people','activity_id'],axis = 1)",No,5,21.0 "#from sklearn.ensemble import RandomForestClassifier #clf = RandomForestClassifier(n_estimators = 10) #clf = clf.fit(X, Y) import xgboost as xgb gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X, Y)",No,5,7.0 "test_df = pd.read_csv('../input/act_test.csv') test_df['activity_category'] = test_df['activity_category'].astype('category').cat.codes columns = ['char_'+str(i) for i in range(1,11)] test_df[columns] = test_df[columns].apply(lambda x: x.astype('category').cat.codes) test_df['date'] = pd.to_datetime(test_df['date']) test_df['day'] = test_df['date'].apply(lambda x:x.day) test_df['year'] = test_df['date'].apply(lambda x:x.year) test_df['month'] = test_df['date'].apply(lambda x:x.month) test_df = test_df.drop(['date'],axis = 1)",No,3,8.0 "test_X = test_df.join(people_df,on = 'people_id', rsuffix='_people') X = test_X.drop(['people_id','people_id_people','activity_id'],axis = 1) #output = clf.predict(X) output = gbm.predict(X)",No,4,48.0 "test_df['outcome'] = output 
test_df.to_csv('submission.csv',columns = ['activity_id','outcome'],index = False)",No,5,25.0 import pandas as pd,No,5,22.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") submission = pd.read_csv(""../input/sampleSubmission.csv"")",No,5,45.0 "print(""Train dataset:"") print(train.head()) print(""Test dataset:"") print(test.head()) print(""Sample submission:"") print(submission.head())",No,5,41.0 print(train.describe()),No,5,40.0 "mean = train.describe()[""count""][""mean""]",No,5,40.0 "m = phone.phone_brand.str.cat(phone.device_model) modelencoder = LabelEncoder().fit(m) phone['model'] = modelencoder.transform(m) gatrain['model'] = phone['model'] gatest['model'] = phone['model'] Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.model))) Xte_model = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) deviceapps.head()",Yes,3,20.0 "d = deviceapps.dropna(subset=['trainrow']) Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), shape=(gatrain.shape[0],napps)) d = deviceapps.dropna(subset=['testrow']) Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), shape=(gatest.shape[0],napps)) print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))",Yes,4,17.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelencoder = LabelEncoder().fit(applabels.label_id) applabels['label'] = labelencoder.transform(applabels.label_id) nlabels = len(labelencoder.classes_)",No,4,20.0 "devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",Yes,4,32.0 "d = devicelabels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(gatrain.shape[0],nlabels)) d = devicelabels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(gatest.shape[0],nlabels)) print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))",No,4,17.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,11.0 "targetencoder = LabelEncoder().fit(gatrain.gender) y = targetencoder.transform(gatrain.gender) nclasses = len(targetencoder.classes_)",No,5,20.0 "clf = LogisticRegression(C=0.08)#, multi_class='multinomial',solver='lbfgs') clf.fit(Xtrain[70001:], y[70001:]) pred = pd.DataFrame(clf.predict_proba(Xtrain[70001:]), index=gatrain.iloc[70001:].index, columns=targetencoder.classes_) 
pred.head()",Yes,3,7.0 "pred.to_csv('test_gender.csv',index=True)",No,5,25.0 "import sys import os import math import numpy as np import pandas as pd import seaborn as sns from scipy.stats import norm, gumbel_r from scipy.optimize import linprog import matplotlib import matplotlib.pyplot as plt from sklearn.utils.extmath import cartesian %matplotlib inline",No,5,23.0 "submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "def plot_bag_weight_distributions(bags, size=10000): plot_distributions(bags, create_bag_weight_sampler, size=size, fit=norm) def plot_distributions(bags, sampler_builder, size=10000, fit=None): num_plots = len(bags) num_cols = int(round(math.sqrt(num_plots))) num_rows = (num_plots // num_cols) num_rows = num_rows if num_plots % num_cols == 0 else num_rows + 1 f, axes = plt.subplots(num_rows, num_cols) axes = axes.reshape(-1) for i in range(num_plots): current_bag = bags[i] current_bag_sampler, current_bag_name = sampler_builder(current_bag) current_sample = current_bag_sampler(size) print(""{}: mean={} | std={}"".format(current_bag_name, np.mean(current_sample), np.std(current_sample))) current_axis = axes[i] sns.distplot(current_sample, ax=current_axis, fit=fit, kde=False) current_axis.set_title(current_bag_name) current_axis.set_yticklabels([]) plt.tight_layout() plt.show() single_gift_bags = [ {""horse"": 1}, {""ball"": 1}, {""bike"": 1}, {""train"": 1}, {""coal"": 1}, {""book"": 1}, {""doll"": 1}, {""blocks"": 1}, {""gloves"": 1} ] plot_bag_weight_distributions(single_gift_bags)",No,5,33.0 "example_bags = [ {""horse"": 1, ""ball"": 2}, {""train"": 3, ""bike"": 1}, {""coal"": 2, ""book"": 2}, {""gloves"": 12, ""book"": 12}, ] plot_bag_weight_distributions(example_bags)",No,5,33.0 "def plot_bag_utility_distributions(bags, size=10000, fit=norm):
plot_distributions(bags, create_bag_utility_sampler, size=size, fit=fit)

def create_bag_utility_sampler(bag):
bag_weight_sampler, bag_name = create_bag_weight_sampler(bag)
def bag_utility_sampler(size=1):
samples = bag_weight_sampler(size)
samples[samples > 50] = 0
return samples
return bag_utility_sampler, bag_name

bag = { ""horse"": 2, ""ball"": 19 }
bag_utility_sampler, name = create_bag_utility_sampler(bag)
print(""Sampling utility from bag {}: {}\
"".format(name, bag_utility_sampler(9)))
plot_bag_utility_distributions(example_bags)",Yes,4,33.0 "def plot_score_distribution(bags, num_tries=60, size=10000, fit=norm, extremal_fit=gumbel_r):
scores = np.zeros(size)
for i, bag in enumerate(bags):
current_bag_sampler, _ = create_bag_utility_sampler(bag)
scores += current_bag_sampler(size)
score_mean, score_std = np.mean(scores), np.std(scores)
print(""Scores: mean = {:0.2f} | std = {:0.2f}"".format(score_mean, score_std))
sns.distplot(scores, fit=fit, kde=False)

plot_extreme_value_distribution(scores, num_tries)
plt.title(""Score distribution / submission distribution with {} tries"".format(num_tries))
plt.show()

def plot_extreme_value_distribution(scores, num_tries, size=10000):
samples = np.max(np.random.choice(scores, size=(size, num_tries)), axis=1)
sns.distplot(samples, fit=gumbel_r, kde=False)
expected_score = np.mean(samples)
plt.axvline(expected_score, color=\'r\')
print(""Expected score after {} trials: {:0.2f}"".format(num_tries, expected_score))

plot_score_distribution(example_bags)",Yes,4,33.0 "def drop_duplicate(candidate_bags, distributions): df = pd.DataFrame(data=np.hstack((candidate_bags, distributions)), columns=gifts + [""mean"", ""var""]) df.drop_duplicates(subset=gifts, inplace=True) return df[gifts].values, df[[""mean"", ""var""]].values candidate_bags = np.vstack([mixed_item_candiadte_bags, low_weight_item_candidate_bags]) bag_weight_distributions = np.vstack([mixed_item_bag_weight_distributions, low_weight_item_bag_weight_distributions]) print(""Combined candiadte bags: {}"".format(candidate_bags.shape)) candidate_bags, bag_weight_distributions = drop_duplicate(candidate_bags, bag_weight_distributions) print(""Final candidate bags without duplicates: {}"".format(candidate_bags.shape))",Yes,3,11.0 pd.read_csv('submission_20.csv').head(),No,5,41.0 "import numpy as np import pandas as pd from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,5,88.0 "train = pd.read_json(""../input/train.json"") train.info()",No,5,44.0 "X = train.drop(""interest_level"", 1) Y = train[""interest_level""].astype(""category"")",No,5,21.0 "X[""street_address""] = X[""street_address""].astype(\'category\').cat.codes
X[""created""] = X[""created""].astype(\'category\').cat.codes
X[""building_id""] = X[""building_id""].astype(\'category\').cat.codes
X[""description""] = X[""description""].astype(\'category\').cat.codes
X[""display_address""] = X[""display_address""].astype(\'category\').cat.codes
X[""manager_id""] = X[""manager_id""].astype(\'category\').cat.codes",No,5,8.0 "features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""street_address"", ""created"", ""description"", ""display_address""] X = X[features]",No,4,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)",No,5,13.0 "from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
KNeighborsClassifier(3),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=[""Classifier"", ""Accuracy"", ""Log Loss""]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print(""=""*30)
    print(name)

    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print(""Accuracy: {:.4%}"".format(acc))

    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print(""Log Loss: {}"".format(ll))

    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)

print(""=""*30)",Yes,3,4.0 "test = pd.read_json(""../input/test.json"") index = test[""listing_id""] test = test[features]",No,5,44.0 "olist = list(test.select_dtypes(['object'])) for col in olist: test[col] = test[col].astype('category').cat.codes",No,5,8.0 "favorite_clf = LinearDiscriminantAnalysis() favorite_clf.fit(X_train, y_train) test_predictions = favorite_clf.predict_proba(test)",Yes,3,7.0 "submission = pd.DataFrame({
""listing_id"": index,
""high"": test_predictions[:,0],
""medium"":test_predictions[:,2],
""low"":test_predictions[:,1]
})

columnsTitles=[""listing_id"",""high"",""medium"",""low""]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv(\'submission.csv\', index=False)",Yes,4,25.0 "from keras.models import Sequential from keras.layers import Dense",No,5,22.0 "from keras.utils import np_utils from sklearn.preprocessing import LabelEncoder encoder = LabelEncoder() encoder.fit(Y) encoded_Y = encoder.transform(Y) # convert integers to dummy variables (i.e. one hot encoded) dummy_y = np_utils.to_categorical(encoded_Y)",No,5,20.0 "# Compile model model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # Fit the model model.fit(X.values, dummy_y, nb_epoch=10, batch_size=10)",No,4,7.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from subprocess import check_output from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,5,88.0 "sub = pd.read_csv('../input/sample_submission.csv') sub.head()",Yes,4,45.0 "df = pd.read_json('../input/train.json') df.tail()",Yes,4,44.0 "df['address'] = df['display_address'].astype('category').cat.codes df['street_address'] = df['street_address'].astype('category').cat.codes df['building_id'] = df['building_id'].astype('category').cat.codes df['manager_id'] = df['manager_id'].astype('category').cat.codes df['num_features'] = df['features'].apply(len) df['created'] = pd.to_datetime(df['created']) df['created_year'] = df['created'].dt.year.astype('category').cat.codes df['created_month'] = df['created'].dt.month.astype('category').cat.codes df['len_description'] = df['description'].apply(lambda x: len(x.split(' '))) df['num_pics'] = df['photos'].apply(len)",No,5,8.0 "new_feat = ['price','address','manager_id','building_id', 'num_features','created_year','created_month', 'len_description','latitude','longitude','num_pics'] #new_feat = ['price','latitude','longitude','num_pics', # 'num_features','created_year','created_month','len_description'] X = df[new_feat].fillna(0) y = df['interest_level'].astype('category').cat.codes X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=9) X.tail()",Yes,3,21.0 "clf1 = GradientBoostingClassifier(n_estimators=200, max_depth=9) clf2 = AdaBoostClassifier(n_estimators=200) clf3 = RandomForestClassifier(n_estimators=300) estimators = [('gb', clf1), ('ab', clf2), ('rf', clf3)] vclf = VotingClassifier(estimators=estimators, voting='soft', n_jobs= -1) vclf.fit(X_train, y_train) y_val_pred = vclf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "X_train = df[new_feat].fillna(0) y_train = df['interest_level'] vclf.fit(X_train, y_train) df2 = pd.read_json('../input/test.json') df2['address'] = df2['display_address'].astype('category').cat.codes df2['street_address'] = df2['street_address'].astype('category').cat.codes df2['building_id'] = df2['building_id'].astype('category').cat.codes df2['manager_id'] = df2['manager_id'].astype('category').cat.codes df2['num_features'] = df2['features'].apply(len) df2['created'] = pd.to_datetime(df2['created']) df2['created_year'] = df2['created'].dt.year.astype('category').cat.codes df2['created_month'] = df2['created'].dt.month.astype('category').cat.codes df2['len_description'] = df2['description'].apply(lambda x: len(x.split(' '))) df2['num_pics'] = df2['photos'].apply(len) X = df2[new_feat].fillna(0) y = 
vclf.predict_proba(X)",Yes,3,8.0 "sub = pd.read_csv('submissionVoting.csv') sub.head()",Yes,4,45.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "df = pd.read_json(open(""../input/train.json"", ""r""))",No,5,44.0 df.head(),No,5,41.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day",No,5,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_year"", ""created_month"", ""created_day""] X = df[num_feats] y = df[""interest_level""] X.head()",Yes,4,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X)",Yes,4,8.0 "import numpy as np import pandas as pd # Look! No scikit learn!",No,5,22.0 "df_train = pd.read_json(open(""../input/train.json"", ""r"")) df_train.set_index(""listing_id"", inplace=True) df_test = pd.read_json(open(""../input/test.json"", ""r"")) df_test.set_index(""listing_id"", inplace=True) # We will work with a concatenation of the two, then split after the scaling. df = pd.concat([df_train, df_test])",Yes,3,44.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) #df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day",No,5,8.0 "df[""logprice""] = np.log(df.price)",No,5,8.0 "df.loc[df.bathrooms == 112, ""bathrooms""] = 1",No,5,14.0 "numeric_feat = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""logprice"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_month"", ""created_day""] for col in numeric_feat: df[col] -= df[col].min() df[col] /= df[col].max()",No,5,18.0 "X_train = df.loc[df.interest_level.notnull(), numeric_feat] y_train = pd.get_dummies(df_train[[""interest_level""]], prefix="""") y_train = y_train[[""_high"", ""_medium"", ""_low""]] # Set the order according to submission X_test = df.loc[df.interest_level.isnull(), numeric_feat]",Yes,3,14.0 "## A dead simple neural network class in Python+Numpy. Plain SGD, and no regularization. 
def sigmoid(X): return 1.0 / ( 1.0 + np.exp(-X) ) def softmax(X): _sum = np.exp(X).sum() return np.exp(X) / _sum class neuralnet(object): def __init__(self, num_input, num_hidden, num_output): self._W1 = (np.random.random_sample((num_input, num_hidden)) - 0.5).astype(np.float32) self._b1 = np.zeros((1, num_hidden)).astype(np.float32) self._W2 = (np.random.random_sample((num_hidden, num_output)) - 0.5).astype(np.float32) self._b2 = np.zeros((1, num_output)).astype(np.float32) def forward(self,X): net1 = np.matmul( X, self._W1 ) + self._b1 y = sigmoid(net1) net2 = np.matmul( y, self._W2 ) + self._b2 z = softmax(net2) return z,y def backpropagation(self, X, target, eta): z, y = self.forward(X) d2 = (z - target) d1 = y*(1.0-y) * np.matmul(d2, self._W2.T) # The updates are done within this method. This more or less implies # utpdates with Stochastic Gradient Decent. Let's fix that later. # TODO: Support for full batch and mini-batches etc. self._W2 -= eta * np.matmul(y.T,d2) self._W1 -= eta * np.matmul(X.reshape((-1,1)),d1) self._b2 -= eta * d2 self._b1 -= eta * d1",No,5,4.0 "# Some hyper-parameters to tune. num_hidden = 17 # I think I get about 1 epoch/sec with this size on the docker instance n_epochs = 100 eta = 0.01",No,5,59.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ",No,5,22.0 "data_path = ""../input/"" train_file = data_path + ""train.json"" test_file = data_path + ""test.json"" train_df = pd.read_json(train_file) test_df = pd.read_json(test_file) print(train_df.shape) print(test_df.shape) ",Yes,3,44.0 "# count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) # convert the created column to datetime object so as to extract more features train_df[""created""] = pd.to_datetime(train_df[""created""]) test_df[""created""] = pd.to_datetime(test_df[""created""]) # Let us extract some features like year, month, day, hour from date columns # train_df[""created_year""] = train_df[""created""].dt.year test_df[""created_year""] = test_df[""created""].dt.year train_df[""created_month""] = train_df[""created""].dt.month test_df[""created_month""] = test_df[""created""].dt.month train_df[""created_day""] = train_df[""created""].dt.day test_df[""created_day""] = test_df[""created""].dt.day train_df[""created_hour""] = train_df[""created""].dt.hour test_df[""created_hour""] = test_df[""created""].dt.hour # adding all these new features to use list # features_to_use.extend([""num_photos"", ""num_features"", ""num_description_words"",""created_year"", ""created_month"", ""created_day"", ""listing_id"", ""created_hour""]) ",No,4,8.0 "categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)
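# Note: fitting the encoder on the combined train and test values keeps the
# integer codes consistent across both frames, so a category that appears only
# in the test set does not raise an error at transform time.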
",No,5,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df['features'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
print(train_df[""features""].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])
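# tr_sparse and te_sparse are scipy sparse matrices with one column per learned
# feature token (at most 200 here); they get hstack-ed with the numeric columns
# in a later cell before training.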
",Yes,3,8.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape) ",Yes,3,11.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer",No,5,22.0 "data_path = ""../input/"" train_file = data_path + ""train.json"" test_file = data_path + ""test.json"" train_df = pd.read_json(train_file) test_df = pd.read_json(test_file) print(train_df.shape) print(test_df.shape)",Yes,4,44.0 "# count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) # convert the created column to datetime object so as to extract more features train_df[""created""] = pd.to_datetime(train_df[""created""]) test_df[""created""] = pd.to_datetime(test_df[""created""]) # Let us extract some features like year, month, day, hour from date columns # train_df[""created_year""] = train_df[""created""].dt.year test_df[""created_year""] = test_df[""created""].dt.year train_df[""created_month""] = train_df[""created""].dt.month test_df[""created_month""] = test_df[""created""].dt.month train_df[""created_day""] = train_df[""created""].dt.day test_df[""created_day""] = test_df[""created""].dt.day train_df[""created_hour""] = train_df[""created""].dt.hour test_df[""created_hour""] = test_df[""created""].dt.hour # adding all these new features to use list # features_to_use.extend([""num_photos"", ""num_features"", ""num_description_words"",""created_year"", ""created_month"", ""created_day"", ""listing_id"", ""created_hour""])",No,4,8.0 "categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)",No,5,20.0 "train_df['features'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df['features'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
print(train_df[""features""].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])",Yes,4,8.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,3,11.0 "out_df.to_csv(""xgb_starter2.csv"", index=False)",No,5,25.0 "df = pd.read_json(open(""../input/train.json"", \'r\'))
df.head()",Yes,4,44.0 "df['num_photos'] = df['photos'].apply(len) df['num_features'] = df['features'].apply(len) df['num_description_words'] = df['description'].apply(lambda x: len(x.split(' '))) df['created'] = pd.to_datetime(df['created']) df['created_year'] = df['created'].dt.year df['created_month'] = df['created'].dt.month df['created_day'] = df['created'].dt.day df['created_hour'] = df['created'].dt.hour df['created_minute'] = df['created'].dt.minute",No,5,8.0 "# price: removing values in 99 percentile df = remove_outlier(df, 'price', [99]) # Latitude & Longitude: # removing outliers: values in the 1/99 percentiles df = remove_outlier(df, 'latitude', [1, 99]) df = remove_outlier(df, 'longitude', [1, 99])",No,4,8.0 "num_feats = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'num_photos', 'num_features', 'num_description_words', 'created_year', 'created_month', 'created_day', 'created_hour', 'created_minute'] X = df[num_feats] y = df['interest_level'] X.head()",Yes,4,21.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_test) log_loss(y_test, y_val_pred)",Yes,3,4.0 "# fitting the model on the entire data without split clf.fit(X, y)",No,5,7.0 "df = pd.read_json(open(""../input/test.json"", \'r\'))
df['num_photos'] = df['photos'].apply(len)
df['num_features'] = df['features'].apply(len)
df['num_description_words'] = df['description'].apply(lambda x: len(x.split(' ')))
df['created'] = pd.to_datetime(df['created'])
df['created_year'] = df['created'].dt.year
df['created_month'] = df['created'].dt.month
df['created_day'] = df['created'].dt.day
df['created_hour'] = df['created'].dt.hour
df['created_minute'] = df['created'].dt.minute
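# The test frame has to receive the same engineered columns as the training
# frame (same names, same meaning), otherwise the fitted classifier cannot
# score it.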
X = df[num_feats]",Yes,4,8.0 y = clf.predict_proba(X),No,5,27.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "clf = ExtraTreesClassifier(n_estimators=100) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from subprocess import check_output

# Load and describe data
# print(check_output([""ls"", ""../input""]).decode(""utf8""))
df = pd.read_json(open(""../input/train.json"", ""r""))
df['n_photos'] = df['photos'].apply(len)
df['n_features'] = df['features'].apply(len)
df['ilevel_categ'] = df['interest_level'].map({'low': 1, 'medium': 2, 'high': 3})
df[""n_description_words""] = df[""description""].apply(lambda x: len(x.split("" "")))
df.info()",No,4,8.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt #plotting library %matplotlib inline import seaborn as sns #plotting library sns.set(color_codes=True) sns.set_style(""white"") ",No,5,23.0 "import sklearn.ensemble from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, log_loss ",No,5,22.0 "# This Python 3 environment comes with many helpful analytics libraries installed
train_data = pd.read_json(""../input/train.json"")
test_data = pd.read_json(""../input/test.json"")
display_count = 2
target = 'interest_level'
",Yes,4,44.0 "train_data.iloc[0] ",No,5,14.0 "train_data['rooms'] = train_data['bedrooms'] + train_data['bathrooms'] train_data['living_rooms'] = train_data['bedrooms'] - train_data['bathrooms'] train_data['even_rooms'] = train_data['rooms'].apply(lambda x : (x%2) == 0) ",No,5,8.0 "def price_per_room(row): rooms = row['rooms'] if rooms == 0: return -1 price_per_room = row['price'] / rooms return price_per_room train_data['price_per_room'] = train_data.apply(lambda row: price_per_room(row), axis=1) ",No,5,8.0 "train_data['created'] = pd.to_datetime(train_data['created']) train_data['year'] = train_data['created'].dt.year train_data['month'] = train_data['created'].dt.month train_data['day'] = train_data['created'].dt.day train_data['hour'] = train_data['created'].dt.hour train_data['month'] = train_data['month'].apply(lambda x: '0' + str(x) if len(str(x)) == 1 else str(x)) train_data['day'] = train_data['day'].apply(lambda x: '0' + str(x) if len(str(x)) == 1 else str(x)) ",No,5,8.0 "train_data['MMDD'] = train_data.apply(lambda x: str(x.month) + str(x.day), axis=1) ",No,5,8.0 "month_counts = train_data.groupby(['day', 'hour']).agg({target: 'count'}) month_counts = month_counts.sort_values(target, ascending=False) ",Yes,3,60.0 "train_data['months_with_less_listings'] = train_data.apply(lambda x: 0 if x['month'] == 31 else 1, axis=1) ",No,5,8.0 "build_group = train_data.groupby([target, 'building_id']) ",No,5,60.0 "buildings_with_all_listings = unstacked_df.ix[unstacked_df['frequency'] > 1] ",No,5,14.0 print(df.shape),No,5,58.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) print(y_val_pred.shape) log_loss(y_val, y_val_pred)",Yes,3,4.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) print(df.shape) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X)",Yes,4,8.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.
",No,5,88.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) df.head(5) ",Yes,3,44.0 "df.shape ",No,5,58.0 "numeric_features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price""] X = df[numeric_features] y = df[""interest_level""] X.head() ",No,3,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) random_forest_classifier = RandomForestClassifier(n_estimators=1500) random_forest_classifier.fit(X_train, y_train) y_val_pred = random_forest_classifier.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,13.0 "df_test = pd.read_json(open(""../input/test.json"", ""r"")) X_test = df_test[numeric_features] y_test = random_forest_classifier.predict_proba(X_test) ",Yes,3,44.0 "plt.figure(figsize=(8, 4))
price_group.head(5)[""total_price""].plot(kind='barh', color=""orange"")
plt.show()
plt.close()
",No,4,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.
",No,5,88.0 "train_data[\'building_id\'] = train_data[\'building_id\'].apply(lambda x: 1 if x != ""0"" else 0)
",No,5,8.0 "# Read the training and test data train_df = pd.read_json(""../input/train.json"") test_df = pd.read_json(""../input/test.json"") #Look at the size of test and train data print(""train data shape: "", train_df.shape[0]); print(""test data shape: "", test_df.shape[0]); ",Yes,3,44.0 "def split_X_y(train_data, features): X = train_data[features] y = train_data[target] return X, y ",No,4,21.0 "# Convert the features like features, photos, description into numeric by computing their length
# Generate hash for the building_id, manager_id
train_df[""num_photos""] = train_df[""photos""].apply(len)
train_df[""num_features""] = train_df[""features""].apply(len)
train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
train_df[""building_gen_id""] = train_df[""building_id""].apply(lambda x: x.encode('utf-8'))
train_df[""building_gen_id""] = train_df[""building_gen_id""].apply(lambda x: string2numeric_hash(x))
train_df[""manager_gen_id""] = train_df[""manager_id""].apply(lambda x: x.encode('utf-8'))
train_df[""manager_gen_id""] = train_df[""manager_gen_id""].apply(lambda x: string2numeric_hash(x))


test_df[""num_photos""] = test_df[""photos""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""building_gen_id""] = test_df[""building_id""].apply(lambda x: x.encode('utf-8'))
test_df[""building_gen_id""] = test_df[""building_gen_id""].apply(lambda x: string2numeric_hash(x))
test_df[""manager_gen_id""] = test_df[""manager_id""].apply(lambda x: x.encode('utf-8'))
test_df[""manager_gen_id""] = test_df[""manager_gen_id""].apply(lambda x: string2numeric_hash(x))
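# Hashing the id strings gives a quick numeric encoding, but distinct ids can
# collide; a frequency or target encoding of manager_id / building_id is
# usually a stronger signal.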
",No,5,8.0 "# Select the features and prepare the Input and target variables selected_features = [""bedrooms"", ""bathrooms"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""building_gen_id"", ""manager_gen_id""] X = train_df[selected_features] Y = train_df[""interest_level""] X.head() ",Yes,4,21.0 "#Split the input into training and validation sets from sklearn.model_selection import train_test_split X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.33) #Pass the input to the algo and calculate the loss from sklearn.ensemble import RandomForestClassifier algo = RandomForestClassifier(n_estimators=100) algo.fit(X_train, Y_train) y_predict_val = algo.predict_proba(X_val) from sklearn.metrics import log_loss log_loss(Y_val, y_predict_val) ",Yes,3,13.0 "X_test = test_df[selected_features] y_predict_test = algo.predict_proba(X_test) ",No,5,48.0 "sub = pd.DataFrame() sub[""listing_id""] = test_df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y_predict_test[:, labels2idx[label]] sub.to_csv(""categoral_numeric.csv"", index=False) print(""process is done1"") ",No,5,25.0 "import numpy as np import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import StratifiedKFold import random from math import exp import xgboost as xgb from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from collections import defaultdict, Counter random.seed(4321) np.random.seed(4321) train_df = pd.read_json(""../input/train.json"") test_df = pd.read_json(""../input/test.json"") train_test = pd.concat([train_df, test_df], 0) ",No,4,44.0 "print(train_df.shape) print(test_df.shape) ",No,5,58.0 "train_df.head(5) ",No,5,41.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day ",Yes,4,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_year"", ""created_month"", ""created_day""] X = df[num_feats] y = df[""interest_level""] X.head() ",Yes,4,21.0 "X_train.loc[10000,""photos""] ",No,5,14.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,13.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) print(df.shape) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X) ",Yes,4,8.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False) 
",No,5,25.0 "%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time as time
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline

def get_skf_indexes(df, target, kfold=4):
X = df.values
y = df[target].values
skf = StratifiedKFold(n_splits=kfold)
skf.get_n_splits(X, y)
indexes = [[],[]]
for train_index, test_index in skf.split(X, y):
indexes[0].append(train_index)
indexes[1].append(test_index)
return indexes
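# Example usage (assuming df_train holds the raw training frame and
# 'interest_level' is the target column; the names are illustrative):
#   train_folds, valid_folds = get_skf_indexes(df_train, 'interest_level')
#   idx_tr, idx_va = train_folds[0], valid_folds[0]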


def output_results(clf, x_test, listing, fname):
preds = clf.predict_proba(x_test)
preds = pd.DataFrame(preds)
cols = ['low', 'medium', 'high']
preds.columns = cols
preds['listing_id'] = listing
preds.to_csv(fname, index=None)
print(preds[cols].mean().values)


def basic_preprocess(df_train, df_test, n_min=50, precision=3):

# Interest: Numerical encoding of interest level
df_train[\'y\'] = 0.0
df_train.loc[df_train.interest_level==\'medium\', \'y\'] = 1.0
df_train.loc[df_train.interest_level==\'high\', \'y\'] = 2.0

# Location features: Latitude, longitude
df_train[\'num_latitude\'] = df_train.latitude.values
df_test[\'num_latitude\'] = df_test.latitude.values
df_train[\'num_longitude\'] = df_train.longitude.values
df_test[\'num_longitude\'] = df_test.longitude.values
x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2)
df_train[\'num_dist_from_center\'] = x.values
x = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2)
df_test[\'num_dist_from_center\'] = x.values
df_train[\'pos\'] = df_train.longitude.round(precision).astype(str) + \'_\' + df_train.latitude.round(precision).astype(str)
df_test[\'pos\'] = df_test.longitude.round(precision).astype(str) + \'_\' + df_test.latitude.round(precision).astype(str)
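# Rounding latitude/longitude to 3 decimals groups listings into cells of
# roughly 100 m; these 'pos' keys feed the density and mean-interest features
# computed below.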

# Degree of ""outlierness""
OutlierAggregated = (df_train.bedrooms > 4).astype(float)
OutlierAggregated2 = (df_test.bedrooms > 4).astype(float)
OutlierAggregated += (df_train.bathrooms > 3).astype(float)
OutlierAggregated2 += (df_test.bathrooms > 3).astype(float)
OutlierAggregated += (df_train.bathrooms < 1).astype(float)
OutlierAggregated2 += (df_test.bathrooms < 1).astype(float)
x = np.abs((df_train.price - df_train.price.median())/df_train.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((df_test.price - df_train.price.median())/df_train.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(df_train.price/(df_train.bedrooms.clip(1,3) + df_train.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(df_test.price/(df_test.bedrooms.clip(1,3) + df_test.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
df_train[\'num_OutlierAggregated\'] = OutlierAggregated.values
df_test[\'num_OutlierAggregated\'] = OutlierAggregated2.values
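# num_OutlierAggregated simply counts how many of the heuristics above flag the
# listing, so larger values mean a listing that is unusual on several axes.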

# Average interest in unique locations at given precision
x = df_train.groupby(\'pos\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_pos\'] = df_train.pos.apply(lambda x: d.get(x, impute))
df_test[\'num_pos\'] = df_test.pos.apply(lambda x: d.get(x, impute))
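# This is a target (mean) encoding computed on the full training set; the n_min
# count threshold damps noisy cells, but a strictly leak-free variant would be
# computed out-of-fold.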

# Density in unique locations at given precision
vals = df_train[\'pos\'].value_counts()
dvals = vals.to_dict()
df_train[\'num_pos_density\'] = df_train[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
df_test[\'num_pos_density\'] = df_test[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
df_train[\'num_building_null\'] = (df_train.building_id==\'0\').astype(float)
df_test[\'num_building_null\'] = (df_test.building_id==\'0\').astype(float)

# Building supervised
x = df_train.groupby(\'building_id\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_building_id\'] = df_train.building_id.apply(lambda x: d.get(x, impute))
df_test[\'num_building_id\'] = df_test.building_id.apply(lambda x: d.get(x, impute))

# Building frequency
d = np.log1p(df_train.building_id.value_counts()).to_dict()
impute = np.min(np.array(list(d.values())))
df_train[\'num_fbuilding\'] = df_train.building_id.apply(lambda x: d.get(x, impute))
df_test[\'num_fbuilding\'] = df_test.building_id.apply(lambda x: d.get(x, impute))

# Manager supervised
x = df_train.groupby(\'manager_id\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_manager\'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
df_test[\'num_manager\'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

# Manager frequency
d = np.log1p(df_train.manager_id.value_counts()).to_dict()
impute = np.min(np.array(list(d.values())))
df_train[\'num_fmanager\'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
df_test[\'num_fmanager\'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

# Creation time features
df_train[\'created\'] = pd.to_datetime(df_train.created)
df_train[\'num_created_weekday\'] = df_train.created.dt.dayofweek.astype(float)
df_train[\'num_created_weekofyear\'] = df_train.created.dt.weekofyear
df_test[\'created\'] = pd.to_datetime(df_test.created)
df_test[\'num_created_weekday\'] = df_test.created.dt.dayofweek
df_test[\'num_created_weekofyear\'] = df_test.created.dt.weekofyear

# Bedrooms/Bathrooms/Price
df_train[\'num_bathrooms\'] = df_train.bathrooms.clip_upper(4)
df_test[\'num_bathrooms\'] = df_test.bathrooms.clip_upper(4)
df_train[\'num_bedrooms\'] = df_train.bedrooms.clip_upper(5)
df_test[\'num_bedrooms\'] = df_test.bedrooms.clip_upper(5)
df_train[\'num_price\'] = df_train.price.clip_upper(10000)
df_test[\'num_price\'] = df_test.price.clip_upper(10000)
bins = df_train.price.quantile(np.arange(0.05, 1, 0.05))
df_train[\'num_price_q\'] = np.digitize(df_train.price, bins)
df_test[\'num_price_q\'] = np.digitize(df_test.price, bins)
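# np.digitize maps each price onto the index of the training-set quantile bin
# it falls into (bins are the 5%..95% quantiles), giving an ordinal price band.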

# Composite features based on:
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
df_train[\'num_priceXroom\'] = (df_train.price / (1 + df_train.bedrooms.clip(1, 4) + 0.5*df_train.bathrooms.clip(0, 2))).values
df_test[\'num_priceXroom\'] = (df_test.price / (1 + df_test.bedrooms.clip(1, 4) + 0.5*df_test.bathrooms.clip(0, 2))).values
df_train[\'num_even_bathrooms\'] = ((np.round(df_train.bathrooms) - df_train.bathrooms)==0).astype(float)
df_test[\'num_even_bathrooms\'] = ((np.round(df_test.bathrooms) - df_test.bathrooms)==0).astype(float)

# Other features
df_train[\'num_features\'] = df_train.features.apply(lambda x: len(x))
df_test[\'num_features\'] = df_test.features.apply(lambda x: len(x))
df_train[\'num_photos\'] = df_train.photos.apply(lambda x: len(x))
df_test[\'num_photos\'] = df_test.photos.apply(lambda x: len(x))
df_train[\'num_desc_length\'] = df_train.description.str.split(\' \').str.len()
df_test[\'num_desc_length\'] = df_test.description.str.split(\' \').str.len()
df_train[\'num_desc_length_null\'] = (df_train.description.str.len()==0).astype(float)
df_test[\'num_desc_length_null\'] = (df_test.description.str.len()==0).astype(float)

# Features/Description Features
bows = {\'nofee\': [\'no fee\', \'no-fee\', \'no fee\', \'nofee\', \'no_fee\'],
\'lowfee\': [\'reduced_fee\', \'low_fee\',\'reduced fee\', \'low fee\'],
\'furnished\': [\'furnished\'],
\'parquet\': [\'parquet\', \'hardwood\'],
\'concierge\': [\'concierge\', \'doorman\', \'housekeep\',\'in_super\'],
\'prewar\': [\'prewar\', \'pre_war\', \'pre war\', \'pre-war\'],
\'laundry\': [\'laundry\', \'lndry\'],
\'health\': [\'health\', \'gym\', \'fitness\', \'training\'],
\'transport\': [\'train\', \'subway\', \'transport\'],
\'parking\': [\'parking\'],
\'utilities\': [\'utilities\', \'heat water\', \'water included\']
}
for fname, bow in bows.items():
x1 = df_train.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
x2 = df_train.features.apply(lambda x: np.sum([1 for i in bow if i in \' \'.join(x).lower()]))
df_train[\'num_\'+fname] = ((x1 + x2) > 0).astype(float).values
x1 = df_test.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
x2 = df_test.features.apply(lambda x: np.sum([1 for i in bow if i in \' \'.join(x).lower()]))
df_test[\'num_\'+fname] = ((x1 + x2) > 0).astype(float).values

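# Each num_<name> column built in the loop above is a 0/1 flag recording whether
# any keyword from that bag appears in the listing description or features list.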
return df_train, df_test",No,5,53.0 "def build_model(name): clf = None if name == 'Random Forest': clf = RandomForestClassifier() if name == 'gbm': clf = GradientBoostingClassifier() return clf ",No,5,4.0 "def fit_model(clf, X_train, y_train): return clf.fit(X_train, y_train) ",No,5,7.0 "df = pd.read_json('../input/train.json') df_test = pd.read_json('../input/test.json') df['created'] = pd.to_datetime(df.created) df_test['created'] = pd.to_datetime(df_test.created)",Yes,3,44.0 "def model_and_predict(model_name, data, features): X, y = split_X_y(data, features) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0) clf = build_model(model_name) clf = fit_model(clf, X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print(""The accuracy is {}"".format(accuracy)) y_proba = clf.predict_proba(X_test) log_loss_score = log_loss(y_test, y_proba) print(""The log_loss_score is {}"".format(log_loss_score)) return clf, accuracy, log_loss_score ",Yes,3,48.0 "# Normalize for i in range(x_train.shape[1]): x_test[:, i] = (x_test[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i]) x_train[:, i] = (x_train[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])",No,5,18.0 "train_features = ['bathrooms', 'rooms', 'living_rooms', 'building_id', 'price_level', 'months_with_less_listings'] clf, accuracy, log_score = model_and_predict('Random Forest', train_copy, train_features) scores.append(accuracy) log_scores.append(log_score) ",Yes,4,27.0 "test_data[\'rooms\'] = test_data[\'bedrooms\'] + test_data[\'bathrooms\']
test_data[\'living_rooms\'] = test_data[\'bedrooms\'] - test_data[\'bathrooms\']
test_data[\'even_rooms\'] = test_data[\'rooms\'].apply(lambda x : (x%2) == 0)

test_data[\'created\'] = pd.to_datetime(test_data[\'created\'])
test_data[\'year\'] = test_data[\'created\'].dt.year
test_data[\'month\'] = test_data[\'created\'].dt.month
test_data[\'day\'] = test_data[\'created\'].dt.day
test_data[\'hour\'] = test_data[\'created\'].dt.hour

test_data[\'price_per_room\'] = test_data.apply(lambda row: price_per_room(row), axis=1)
test_data[\'price_level\'] = test_data[\'price\'].apply(lambda x: 1 if x<2000 else 0)
test_data[\'building_id\'] = test_data[\'building_id\'].apply(lambda x: 1 if x != ""0"" else 0)

test_data[\'months_with_less_listings\'] = test_data.apply(lambda x: 0 if x[\'month\'] == 31 else 1, axis=1)
",No,5,8.0 "X_test = test_data[train_features] y_proba = clf.predict_proba(X_test) ",No,5,48.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #input data train_df=pd.read_json('../input/train.json') test_df=pd.read_json('../input/test.json')",No,5,44.0 "#basic features train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""] test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""] train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""] test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""] # count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) features_to_use=[""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"",""price_t"",""num_photos"", ""num_features"", ""num_description_words"",""listing_id""]",No,5,8.0 " train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,4,21.0 "results_df = pd.DataFrame(results) ",No,5,12.0 "result = pd.concat([results_df, id_df], axis=1) ",No,5,11.0 "train[\'display_address\'] = train[\'display_address\'].apply(lambda x: x.strip("".""))
train[\'display_address\'] = train[\'display_address\'].apply(lambda x: x.lower())
ga = train.groupby([\'display_address\'])[\'display_address\'].count().fillna(0)
ga = pd.DataFrame(ga)
ga.columns = [\'display_count\']
ga[\'display_address\'] = ga.index
ga.loc[ga[\'display_address\'] == \'\',\'display_count\'] = 0
pd.DataFrame(ga)",No,3,8.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_json('../input/train.json') ### and test if everything OK train.head()",Yes,3,44.0 "train['num_photos'] = train['photos'].apply(len) train['num_features'] = train['features'].apply(len) train['num_description_words'] = train['description'].apply(lambda x: len(x.split(' '))) train['rooms'] = train['bathrooms'] + train['bedrooms'] ulimit = np.percentile(train.price.values, 99) train['price'].loc[train['price']>ulimit] = ulimit train['rooms_per_price'] = train['rooms']/train['price'] train = train[train['bedrooms'] > 0] train['bath_per_beds'] = train['bathrooms']/train['bedrooms'] train.loc[train['bath_per_beds'] > 999999999999,'bath_per_beds'] = 0",No,5,8.0 "X = train[['bathrooms','bedrooms','price','num_photos', 'num_features','num_description_words','rooms','rooms_per_price', 'bath_per_beds','latitude','longitude','building_count','manager_count','display_count']] y = train['labels'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)",Yes,4,21.0 "gbc = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=600, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=5, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto') gbc.fit(X,y) out = gbc.predict_proba(X_test)",Yes,4,4.0 "print(log_loss(y_test,out))",No,5,49.0 " test.loc[test['bedrooms'] == 0,'bath_per_beds'] = 0 X = test[['bathrooms','bedrooms','price','num_photos', 'num_features','num_description_words','rooms','room_per_price', 'bath_per_beds','latitude','longitude','building_count','manager_count','display_count']] X = X.fillna(0)",No,5,17.0 out = gbc.predict_proba(X),No,5,48.0 "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
param = {}
param['objective'] = 'multi:softprob'
param['eta'] = 0.03
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = ""mlogloss""
param['min_child_weight'] = 1
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['seed'] = seed_val
num_rounds = num_rounds

plst = list(param.items())
xgtrain = xgb.DMatrix(train_X, label=train_y)

if test_y is not None:
xgtest = xgb.DMatrix(test_X, label=test_y)
watchlist = [ (xgtrain, 'train'), (xgtest, 'test') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
else:
xgtest = xgb.DMatrix(test_X)
model = xgb.train(plst, xgtrain, num_rounds)

pred_test_y = model.predict(xgtest)
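# Note: when early stopping triggers, the returned model carries best_iteration
# and best_score attributes; older xgboost versions also allow limiting
# predict() to the best trees via ntree_limit.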
return pred_test_y, model",Yes,3,59.0 "result.head(3) ",No,5,41.0 "result.to_csv(""submission.csv"", index=False) ",No,5,25.0 "import numpy as np import pandas as pd ",No,5,22.0 "data_path = ""../input/""
train_file = data_path + ""train.json""
test_file = data_path + ""test.json""
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)

train_df = train_df.fillna(\'\')
test_df = test_df.fillna(\'\')

train_df[\'photos_num\'] = train_df.photos.apply(lambda x: len(x))
test_df[\'photos_num\'] = test_df.photos.apply(lambda x: len(x))

train_df[\'features_num\'] = train_df.features.apply(lambda x: len(x))
test_df[\'features_num\'] = test_df.features.apply(lambda x: len(x))

print(\'Shape of train dataset = \' + str(train_df.shape))
print(\'Shape of test dataset = \' + str(test_df.shape))",Yes,3,8.0 "cols = ['bathrooms', 'bedrooms', 'building_id', \\
'description', 'display_address', 'latitude', \\
'longitude', 'manager_id', 'price', 'street_address', \\
'photos_num', 'features_num']
df_merged = pd.merge(train_df, test_df, \\
on=cols, \\
suffixes=('_train', '_test'), how='right')
df_merged = df_merged.rename(columns={'listing_id_test': 'listing_id'})
df_merged.head()",Yes,3,32.0 "fname = 'sample_submission.csv' subm = pd.read_csv(data_path + fname) subm = subm.merge(df_merged[['listing_id','interest_level']], on='listing_id')",Yes,4,45.0 print('Number of duplicates = ' + str(np.sum(subm.duplicated(subset='listing_id')))),No,4,38.0 "subm.sort_values('listing_id').loc[subm.duplicated(subset='listing_id', keep=False)].head(10)",Yes,4,41.0 "print('Number of duplicates in train = ' + \\
str(np.sum(train_df.duplicated(subset=cols, keep=False))))
print('Number of duplicates in test = ' + \\
str(np.sum(test_df.duplicated(subset=cols, keep=False))))",No,5,38.0 "subm.low.loc[subm.interest_level=='low'] = 1.0 subm.medium.loc[subm.interest_level=='low'] = 0.0 subm.high.loc[subm.interest_level=='low'] = 0.0 subm.low.loc[subm.interest_level=='medium'] = 0.0 subm.medium.loc[subm.interest_level=='medium'] = 1.0 subm.high.loc[subm.interest_level=='medium'] = 0.0 subm.low.loc[subm.interest_level=='high'] = 0.0 subm.medium.loc[subm.interest_level=='high'] = 0.0 subm.high.loc[subm.interest_level=='high'] = 1.0 subm = subm.groupby('listing_id').mean() print('subm.shape = ' + str(subm.shape)) subm.head()",Yes,4,14.0 "subm.to_csv('submission.csv', index=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for visualization
import seaborn as sns
from sklearn import linear_model
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, plot_importance

%matplotlib inline
### Seaborn style
sns.set_style(""whitegrid"")

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

trainingData = pd.read_json(\'../input/train.json\')


#trainingData[\'building_id\'] = trainingData[\'building_id\'].to_string

trainingData.info()",Yes,4,44.0 trainingData.isnull().sum(),No,5,39.0 "sns.countplot(trainingData.interest_level, order=['low', 'medium', 'high']); plt.xlabel('Interest Level'); plt.ylabel('Number of occurrences');",No,5,33.0 " trainingData['numPics'] = trainingData['photos'].apply(len) trainingData.info()",Yes,4,8.0 "features = [x for sublist in trainingData[\'features\'] for x in sublist]

for x in features:
if ""*"" in x: features.remove(x)

features = set(features)

features.discard(\'\')

has_ac = [ s for s in features if any(ac_name in s for ac_name in [\'A/C\', ""AC"", ""Air Conditioning""] ) and not(any(wrong in s for wrong in [""FIRE"",\'ACT\',\'APT\', \'SPACE\',\'YARD\'])) ]

trainingData[\'has_ac\'] = [any(ac in feature for ac in has_ac) for feature in trainingData[\'features\'] ]

free_included = [ s for s in features if any(ac_name in s for ac_name in [""free"",""FREE"",""Free"", ""Gift"", ""gift"", \'1/2 Month fee\', ""included"", ""INCLUDED"",""Included""] ) ]

trainingData[""included_offer""] = [any(free in feature for free in free_included) for feature in trainingData[\'features\']]

doorman = [ s for s in features if any(ac_name in s for ac_name in [""doorman"",""DOORMAN"",""Doorman"",\'doormen\',\'Doormen\', \'full-service\', \'concierge\',\'Concierge\',\'Attended Lobby\', \'Attended lobby\', \'attended lobby\'] ) ]

trainingData[""concierge""] = [any(door in feature for door in doorman) for feature in trainingData[\'features\']]

Washer = [ s for s in features if any(ac_name in s for ac_name in [\'Washer\', ""Dryer"",\'Washer\',\'Dryer\',\'washer\',\'dryer\',\'laundry\',\'LAUNDRY\',\'Laundry\'] ) and not(any(notname in s for notname in [\'dish\',\'DISH\',\'Dish\', \'Disw\'] )) ]

trainingData[""laundry""] = [any(laundry in feature for laundry in Washer) for feature in trainingData[\'features\']]",No,5,8.0 "labelEncoder = LabelEncoder() trainingData['interest'] = labelEncoder.fit_transform(trainingData['interest_level']) trainingDataSub = trainingData.loc[trainingData['interest']==0] trainingDataSub = trainingDataSub.append(trainingData.loc[trainingData['interest']==1].sample(15000)) trainingDataSub = trainingDataSub.append(trainingData.loc[trainingData['interest']==2].sample(10000)) y = trainingDataSub['interest'] X = trainingDataSub[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'numPics', 'has_ac', 'included_offer', 'concierge', 'laundry']] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=52)",Yes,3,21.0 trainingData.dtypes,No,5,70.0 "from sklearn import neural_network regr = neural_network.MLPClassifier(hidden_layer_sizes = (50,50,10)) regr.fit(X_train, y_train)",Yes,4,4.0 "deepfor = RandomForestClassifier(n_estimators=3, random_state=52) deepfor.fit(X_train, y_train) ",Yes,3,4.0 "testingData = pd.read_json('../input/test.json') testingData['numPics'] = testingData['photos'].apply(len)",Yes,4,44.0 "features = [x for sublist in testingData[\'features\'] for x in sublist]

for x in features:
if ""*"" in x: features.remove(x)

features = set(features)

features.discard(\'\')

has_ac = [ s for s in features if any(ac_name in s for ac_name in [\'A/C\', ""AC"", ""Air Conditioning""] ) and not(any(wrong in s for wrong in [""FIRE"",\'ACT\',\'APT\', \'SPACE\',\'YARD\'])) ]

testingData[\'has_ac\'] = [any(ac in feature for ac in has_ac) for feature in testingData[\'features\'] ]

free_included = [ s for s in features if any(ac_name in s for ac_name in [""free"",""FREE"",""Free"", ""Gift"", ""gift"", \'1/2 Month fee\', ""included"", ""INCLUDED"",""Included""] ) ]

testingData[""included_offer""] = [any(free in feature for free in free_included) for feature in testingData[\'features\']]

doorman = [ s for s in features if any(ac_name in s for ac_name in [""doorman"",""DOORMAN"",""Doorman"",\'doormen\',\'Doormen\', \'full-service\', \'concierge\',\'Concierge\',\'Attended Lobby\', \'Attended lobby\', \'attended lobby\'] ) ]

testingData[""concierge""] = [any(door in feature for door in doorman) for feature in testingData[\'features\']]

Washer = [ s for s in features if any(ac_name in s for ac_name in [\'Washer\', ""Dryer"",\'Washer\',\'Dryer\',\'washer\',\'dryer\',\'laundry\',\'LAUNDRY\',\'Laundry\'] ) and not(any(notname in s for notname in [\'dish\',\'DISH\',\'Dish\', \'Disw\'] )) ]

testingData[""laundry""] = [any(laundry in feature for laundry in Washer) for feature in testingData[\'features\']]",No,4,8.0 "X = testingData[[\'bathrooms\', \'bedrooms\', \'latitude\', \'longitude\', \'price\', \'numPics\', \'has_ac\', \'included_offer\', \'concierge\', \'laundry\']]

predictions = regr.predict_proba(X)

output = pd.DataFrame(testingData[\'listing_id\'], columns = [\'listing_id\'])

output[\'high\'] = predictions[:,0]
output[\'low\'] = predictions[:,1]
output[\'medium\'] = predictions[:,2]


output.to_csv(""submission1.csv"", index=False)",Yes,3,48.0 "# objective is to predict a number of listing enquiries based on features train = pd.read_json(""../input/train.json"", ""r"") test = pd.read_json(""../input/test.json"", ""r"") sample_sub = pd.read_csv(""../input/sample_submission.csv"")",Yes,4,44.0 "sample_sub.head() # the above is what our submission is supposed to look like",No,5,41.0 from sklearn.naive_bayes import GaussianNB,No,5,22.0 gnb = GaussianNB(),No,5,4.0 "train.index = train['listing_id'] train = train.drop('interest_level', 1) model = gnb.fit(train, train_target)",Yes,3,10.0 y = model.predict_proba(test),No,5,48.0 y_dat = pd.DataFrame(y),No,5,12.0 "#y_dat.copy(deep = False) y_dat.loc[:,'listing_id'] = test.index",No,5,8.0 "y_dat.rename(columns = {'0':'medium', '1':'low', '2':'high'}, inplace = True)",No,5,61.0 data.head(),No,5,41.0 "#medium, low, high
#writer = pd.ExcelWriter(\'/Users/reshmasekar/Desktop/sub.xlsx\', engine=\'xlsxwriter\')
# Convert the dataframe to an XlsxWriter Excel object.
data.to_csv(""sub_rf_4.csv"", index = False)
#y_dat.to_excel(""/Users/reshmasekar/Desktop"")",No,5,25.0 train.head(),No,5,41.0 "from sklearn import neural_network #regr = neural_network.MLPClassifier(hidden_layer_sizes = (50,50,10)) #regr.fit(X_train, y_train)",No,5,22.0 "from sklearn.cluster import KMeans regr = RandomForestClassifier(n_estimators=300, random_state=52) regr.fit(X_train, y_train) pred = regr.predict(X_test) print(confusion_matrix(pred, y_test)) print(accuracy_score(pred, y_test)) print(labelEncoder.classes_)",Yes,3,4.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from sklearn import linear_model from os.path import pardir, curdir, join ",No,5,22.0 "df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") ",No,5,45.0 "df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) ",No,5,16.0 "df_store['CompetitionDistance'] == df_store['CompetitionDistance'].apply(lambda x: np.log(x)) ",No,5,8.0 "df_test = df_test.fillna(df_test.mean()) ",No,5,17.0 "closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values df_test = df_test[df_test[""Open""] != 0] ",No,5,14.0 "df_test = df_test.drop(['Date', 'StateHoliday'], axis=1) ",No,5,10.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss ",No,5,22.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) print(df.shape) ",Yes,4,44.0 "df.head() ",No,5,41.0 "print(df.shape) ",No,5,58.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day ",No,5,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""price""] X = df[num_feats] y = df[""interest_level""] X.head() ",Yes,4,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) ",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,4.0 "# Save the test IDs for Kaggle submission test_ids = act_test['activity_id'] def preprocess_acts(data,min_date, train_set=True): # Getting rid of data feature for now dates=data['date'] dates=process_dates(dates,min_date) data = data.drop(['date', 'activity_id'], axis=1) if(train_set): data = data.drop(['outcome'], axis=1) ## Split off _ from people_id data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) columns = list(data.columns) # Convert strings to ints for col in columns[1:]: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) # for column in columns[1:]: # dummies = pd.get_dummies(data[column]) # data[dummies.columns] = dummies data['dates']=dates return data def preprocess_people(data,min_date): dates=data['date'] dates=process_dates(dates,min_date) # TODO refactor this duplication data = data.drop(['date'], axis=1) data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) # Values in the people df is Booleans and 
Strings columns = list(data.columns) bools = columns[11:] strings = columns[1:11] for col in bools: data[col] = pd.to_numeric(data[col]).astype(int) for col in strings: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) #data = data.drop(['group_1'], axis=1) # for column in strings: # dummies = pd.get_dummies(data[column]) # data[dummies.columns] = dummies data['dates']=dates return data",Yes,2,8.0 "#find minimum date min_date=pd.concat([people['date'],act_train['date'],act_test['date']]).min() min_date",No,3,11.0 "# Preprocess each df min_date=pd.concat([people['date'],act_train['date'],act_test['date']]).min() peeps = preprocess_people(people,min_date) actions_train = preprocess_acts(act_train,min_date,train_set=True) actions_test = preprocess_acts(act_test,min_date,train_set=False) print (peeps.columns) print (actions_train.columns) peeps.sample(10)",Yes,4,71.0 actions_train.sample(10),No,5,41.0 "# Merege into a unified table # Training features = actions_train.merge(peeps, how='left', on='people_id') features=features.drop(['people_id'],axis=1) labels = act_train['outcome'] # Testing test = actions_test.merge(peeps, how='left', on='people_id') test=test.drop(['people_id'],axis=1) # Check it out... features.sample(10)",No,4,32.0 "columnss=list(features.columns) columnss #features['group_1'].nunique()",No,5,71.0 "## Split Training Data from sklearn.cross_validation import train_test_split num_test = 0.10 X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=num_test, random_state=23) ## Out of box random forest from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.metrics import accuracy_score, roc_auc_score #from sklearn.grid_search import GridSearchCV #clf=GradientBoostingClassifier() clf = RandomForestClassifier() clf.fit(X_train, y_train)",Yes,4,7.0 "## Training Predictions proba = clf.predict_proba(X_test) preds = proba[:,1] score = roc_auc_score(y_test, preds) print(""Area under ROC {0}"".format(score))",No,4,49.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_ky.csv"", index=False) ",Yes,3,12.0 "df[num_feats] ",No,5,41.0 "# objective is to predict a number of listing enquiries based on features train = pd.read_json(""../input/train.json"", ""r"") test = pd.read_json(""../input/test.json"", ""r"") sample_sub = pd.read_csv(""../input/sample_submission.csv"") ",Yes,4,44.0 "train[""num_photos""] = train[""photos""].apply(len) train[""num_features""] = train[""features""].apply(len) train[""num_description_words""] = train[""description""].apply(lambda x: len(x.split("" ""))) train[""created""] = pd.to_datetime(train[""created""]) train[""created_year""] = train[""created""].dt.year train[""created_month""] = train[""created""].dt.month train[""created_day""] = train[""created""].dt.day ",No,5,8.0 "test[""num_photos""] = test[""photos""].apply(len) test[""num_features""] = test[""features""].apply(len) test[""num_description_words""] = test[""description""].apply(lambda x: len(x.split("" ""))) test[""created""] = pd.to_datetime(test[""created""]) test[""created_year""] = test[""created""].dt.year test[""created_month""] = test[""created""].dt.month test[""created_day""] = test[""created""].dt.day ",No,5,8.0 "from sklearn.linear_model import LogisticRegression ",No,5,22.0 "lr = LogisticRegression() ",No,5,4.0 "y 
= model.predict_proba(test) ",No,5,48.0 "y_dat = pd.DataFrame(y) ",No,5,12.0 "#y_dat.copy(deep = False) y_dat.loc[:,'listing_id'] = test.index ",No,5,8.0 "y_dat.rename(columns = {'0':'medium', '1':'low', '2':'high'}, inplace = True) ",No,5,61.0 "data.head() ",No,5,41.0 "# Convert the dataframe to a CSV data.to_csv(""sub_ky_3.csv"", index = False) ",No,5,25.0 "import matplotlib.pyplot as plt plt.style.use('ggplot') plt.rcParams['figure.figsize'] = (12,8) %matplotlib inline",No,4,22.0 "df_stores = pd.read_csv('../input/store.csv', sep=',') df_data = pd.read_csv('../input/train.csv', sep=',') df_test = pd.read_csv('../input/test.csv', sep=',')",No,5,45.0 "print(df_stores.shape) print(df_data.shape) print(df_test.shape)",No,5,58.0 "#
df_stores = df_stores.drop('CompetitionOpenSinceMonth', axis=1).drop('CompetitionOpenSinceYear', axis=1).drop('Promo2SinceWeek', axis=1).drop('Promo2SinceYear', axis=1)",No,5,10.0 df_stores.head(),No,5,41.0 df_data.head(),No,5,41.0 df_test.head(),No,5,41.0 "# ,
closed_stores = df_test[""Id""][df_test[""Open""] == 0].values
df_test = df_test[df_test[""Open""] != 0]
df_data = df_data[df_data[""Open""] != 0]",No,5,14.0 "# Drop columns that are not needed for prediction
df_data = df_data.drop('Open', axis=1).drop('Customers', axis=1)
df_test = df_test.drop('Open', axis=1)",No,5,10.0 "# Extract month and year from the Date column
df_data['Month'] = df_data['Date'].apply(lambda x: int(x[5:7]))
df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7]))
df_data['Year'] = df_data['Date'].apply(lambda x: int(x[:4]))
df_test['Year'] = df_test['Date'].apply(lambda x: int(x[:4]))
df_test = df_test.drop('Date', axis=1)
df_data = df_data.drop('Date', axis=1)",Yes,4,8.0 "# Add the average sales of each store as a new feature,
# i.e. join the per-store mean of Sales onto the store table.
df_stores = df_stores.join(df_data.groupby('Store')['Sales'].mean(), on='Store').rename(columns={'Sales': 'Av_sales'})",Yes,4,61.0 "# Scale the average sales feature to the [0, 1] range
max_s = df_stores['Av_sales'].max()
df_stores['Av_sales']=df_stores['Av_sales']/max_s,No,5,8.0 "df_test = pd.merge(df_test, df_stores, left_index=True, on='Store') df_data = pd.merge(df_data, df_stores, left_index=True, on='Store')",No,5,32.0 "df_data['CompetitionDistance'].fillna(df_data['CompetitionDistance'].median(), inplace = True) df_test['CompetitionDistance'].fillna(df_data['CompetitionDistance'].median(), inplace = True)",No,5,17.0 "max_dist = df_data['CompetitionDistance'].max() df_data['CompetitionDistance']=df_data['CompetitionDistance']/max_dist df_test['CompetitionDistance']=df_test['CompetitionDistance']/max_dist",No,5,8.0 "df_data['PromoInterval'].fillna('n', inplace = True) df_test['PromoInterval'].fillna('n', inplace = True)",No,5,17.0 "# , 0 1, ,
#
print('Days', df_data['DayOfWeek'].unique())
print ('Month', df_data['Month'].unique())
print ('Promo', df_data['Promo'].unique())
print ('StateHoliday', df_data['StateHoliday'].unique())
print ('SchoolHoliday', df_data['SchoolHoliday'].unique())
print ('StoreType', df_data['StoreType'].unique())
print ('Assortment', df_data['Assortment'].unique())
print ('PromoInterval', df_data['PromoInterval'].unique())
print ('Year', df_data['Year'].unique())",No,5,57.0 "df_data[""StateHoliday""].loc[df_data[""StateHoliday""] == 0] = ""0"" df_test[""StateHoliday""].loc[df_test[""StateHoliday""] == 0] = ""0""",No,5,14.0 "print('StateHoliday', df_data['StateHoliday'].unique())",No,5,57.0 #
df_data = df_data[df_data['Sales']!=0],No,5,14.0 df_data = df_data[df_data['Sales']<34500],No,5,14.0 df_data.shape,No,5,58.0 "df_data = df_data.drop('Store', axis=1) df_test = df_test.drop('Store', axis=1)",No,5,10.0 "from sklearn.model_selection import train_test_split df_train, df_mytest = train_test_split(df_data, test_size = 0.2)",No,5,13.0 "X_train, y_train = df_train.drop('Sales', axis=1).values, df_train['Sales'].values",No,5,21.0 "from sklearn.linear_model import LinearRegression model = LinearRegression(n_jobs=4)",No,5,4.0 "model.fit(X_train, y_train)",No,5,7.0 "# Test Set Predictions test_proba = clf.predict_proba(test) test_preds = test_proba[:,1] test_res=clf.predict(test) # Format for submission output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds }) output1 = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_res }) output.head()",Yes,4,55.0 "output.to_csv('redhat.csv', index = False) output1.to_csv('redhat_noprpba.csv', index = False)",No,5,25.0 "import os import sys import operator import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ",Yes,3,8.0 "#input data
train_df=pd.read_json(\'../input/train.json\')
test_df=pd.read_json(\'../input/test.json\')
train_df.head()
#removing outliers

test_df[""bathrooms""].loc[19671] = 1.5
test_df[""bathrooms""].loc[22977] = 2.0
test_df[""bathrooms""].loc[63719] = 2.0

ulimit = np.percentile(train_df.price.values, 99)
train_df[\'price\'].ix[train_df[\'price\']>ulimit] = ulimit

ulimit = np.percentile(test_df.price.values, 99)
test_df[\'price\'].ix[test_df[\'price\']>ulimit] = ulimit
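# Prices are winsorized: everything above the 99th percentile is clipped to that
# percentile. Note that .ix is deprecated in newer pandas; an equivalent,
# illustrative form would be:
# train_df.loc[train_df['price'] > ulimit, 'price'] = ulimit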

train_df[""logprice""] = np.log(train_df[""price""])
test_df[""logprice""] = np.log(test_df[""price""])

# count of ""photos""
train_df[""num_photos""] = train_df[""photos""].apply(len)
test_df[""num_photos""] = test_df[""photos""].apply(len)

train_df[""num_features""] = train_df[""features""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)

train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))



train_df[""pos""] = train_df.longitude.round(3).astype(str) + \'_\' + train_df.latitude.round(3).astype(str)
test_df[""pos""] = test_df.longitude.round(3).astype(str) + \'_\' + test_df.latitude.round(3).astype(str)

vals = train_df[\'pos\'].value_counts()
dvals = vals.to_dict()
train_df[""density""] = train_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
test_df[""density""] = test_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))


#basic features
train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""]
test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""]
train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""]
test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""]


train_df[\'price_per_room\'] = train_df[\'price\']/train_df[\'room_sum\']
test_df[\'price_per_room\'] = test_df[\'price\']/test_df[\'room_sum\']

image_date = pd.read_csv(\'../input/listing_image_time.csv\')

# rename columns so you can join tables later on
image_date.columns = [""listing_id"", ""time_stamp""]

# fix the single timestamp from April; all other timestamps are from Oct/Nov
image_date.loc[80240,""time_stamp""] = 1478129766

image_date[""img_date""] = pd.to_datetime(image_date[""time_stamp""], unit=""s"")
image_date[""img_days_passed""] = (image_date[""img_date""].max() - image_date[""img_date""]).astype(""timedelta64[D]"").astype(int)
image_date[""img_date_month""] = image_date[""img_date""].dt.month
image_date[""img_date_week""] = image_date[""img_date""].dt.week
image_date[""img_date_day""] = image_date[""img_date""].dt.day
image_date[""img_date_dayofweek""] = image_date[""img_date""].dt.dayofweek
image_date[""img_date_dayofyear""] = image_date[""img_date""].dt.dayofyear
image_date[""img_date_hour""] = image_date[""img_date""].dt.hour
image_date[""img_date_monthBeginMidEnd""] = image_date[""img_date_day""].apply(lambda x: 1 if x<10 else 2 if x<20 else 3)

train_df = pd.merge(train_df, image_date, on=""listing_id"", how=""left"")
test_df = pd.merge(test_df, image_date, on=""listing_id"", how=""left"")

features_to_use=[""bathrooms"", ""bedrooms"", ""price_t"",""room_sum"",""latitude"",""longitude"",""num_photos"",""density"",""logprice"",""num_features"",""num_description_words"",""price_per_room"",""listing_id"",""img_date""]
print(train_df[\'price\'].head())
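# Note: price_t and price_per_room divide by bedrooms / room_sum, which can be
# 0 for studio listings and then yields inf. A guarded variant (illustrative,
# not part of the original cell) would be, e.g.:
# train_df['price_t'] = train_df['price'] / train_df['bedrooms'].clip(lower=1)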
",Yes,3,8.0 "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=818):
    param = {}
    param[\'objective\'] = \'multi:softprob\'
    param[\'eta\'] = 0.03
    param[\'max_depth\'] = 6
    param[\'silent\'] = 1
    param[\'num_class\'] = 3
    param[\'eval_metric\'] = ""mlogloss""
    param[\'min_child_weight\'] = 1
    param[\'subsample\'] = 0.7
    param[\'colsample_bytree\'] = 0.7
    param[\'seed\'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,\'train\'), (xgtest, \'test\') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
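
# Example usage (illustrative; assumes train_X, train_y and a validation split
# val_X, val_y built the same way as later in this notebook):
# preds, model = runXGB(train_X, train_y, val_X, val_y, num_rounds=818)
# With multi:softprob and num_class=3, preds holds one probability per class
# ('high', 'medium', 'low') for every row of the test matrix.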
",No,3,4.0 "index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])


train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c
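# The loop above is an out-of-fold (5-fold) target encoding of manager_id:
# each listing's manager_level_* shares are computed only from the other four
# folds, which limits leakage of the interest_level target into these features.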
",No,4,20.0 "a=[]
b=[]
c=[]
building_level={}
for j in train_df[\'manager_id\'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp[\'interest_level\']==\'low\':
        building_level[temp[\'manager_id\']][0]+=1
    if temp[\'interest_level\']==\'medium\':
        building_level[temp[\'manager_id\']][1]+=1
    if temp[\'interest_level\']==\'high\':
        building_level[temp[\'manager_id\']][2]+=1

for i in test_df[\'manager_id\'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df[\'manager_level_low\']=a
test_df[\'manager_level_medium\']=b
test_df[\'manager_level_high\']=c


features_to_use.append(\'manager_level_low\')
features_to_use.append(\'manager_level_medium\')
features_to_use.append(\'manager_level_high\')
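# Managers that never appear in the training data keep NaN for the
# manager_level_* features; they are left unimputed here and XGBoost treats
# them as missing values during tree construction.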
categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
",No,4,20.0 for f in categorical:
    if train_df[f].dtype=='object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)
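# The LabelEncoder is fit on the union of train and test values so both frames
# share one consistent integer mapping for each categorical column.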
,No,4,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df[\'features\'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
",No,5,78.0 "def factorize(df1, df2, column): ps = df1[column].append(df2[column]) factors = ps.factorize()[0] df1[column] = factors[:len(df1)] df2[column] = factors[len(df1):] return df1, df2 ",No,5,20.0 "for col in ('building_id', 'display_address', 'manager_id', 'street_address'): train_df,test_df = factorize(train_df, test_df,col) ",No,5,20.0 "X_test, y_mytest = df_mytest.drop('Sales', axis=1).values, df_mytest['Sales'].values",No,5,21.0 y_hat = model.predict(X_test),No,5,48.0 from sklearn.linear_model import Ridge,No,5,22.0 "R_model = Ridge(alpha=1) R_model.fit(X_train, y_train)",Yes,4,4.0 "import os import sys import operator import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer",No,5,88.0 "#input data
train_df=pd.read_json(\'../input/train.json\')
test_df=pd.read_json(\'../input/test.json\')
train_df.head()
#removing outliers

test_df[""bathrooms""].loc[19671] = 1.5
test_df[""bathrooms""].loc[22977] = 2.0
test_df[""bathrooms""].loc[63719] = 2.0

ulimit = np.percentile(train_df.price.values, 99)
train_df[\'price\'].ix[train_df[\'price\']>ulimit] = ulimit

ulimit = np.percentile(test_df.price.values, 99)
test_df[\'price\'].ix[test_df[\'price\']>ulimit] = ulimit

train_df[""logprice""] = np.log(train_df[""price""])
test_df[""logprice""] = np.log(test_df[""price""])

# count of ""photos""
train_df[""num_photos""] = train_df[""photos""].apply(len)
test_df[""num_photos""] = test_df[""photos""].apply(len)

train_df[""num_features""] = train_df[""features""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)

train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))



train_df[""pos""] = train_df.longitude.round(3).astype(str) + \'_\' + train_df.latitude.round(3).astype(str)
test_df[""pos""] = test_df.longitude.round(3).astype(str) + \'_\' + test_df.latitude.round(3).astype(str)

vals = train_df[\'pos\'].value_counts()
dvals = vals.to_dict()
train_df[""density""] = train_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
test_df[""density""] = test_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))


#basic features
train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""]
test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""]
train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""]
test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""]


train_df[\'price_per_room\'] = train_df[\'price\']/train_df[\'room_sum\']
test_df[\'price_per_room\'] = test_df[\'price\']/test_df[\'room_sum\']


features_to_use=[""bathrooms"", ""bedrooms"", ""price_t"",""room_sum"",""latitude"",""longitude"",""num_photos"",""density"",""logprice"",""num_features"",""num_description_words"",""price_per_room"",""listing_id""]
print(train_df[\'price\'].head())",Yes,4,8.0 for f in categorical:
\tif train_df[f].dtype=='object':
\t\t#print(f)
\t\tlbl = preprocessing.LabelEncoder()
\t\tlbl.fit(list(train_df[f].values) + list(test_df[f].values))
\t\ttrain_df[f] = lbl.transform(list(train_df[f].values))
\t\ttest_df[f] = lbl.transform(list(test_df[f].values))
\t\tfeatures_to_use.append(f),No,5,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df[\'features\'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))",No,5,78.0 print(train_df[features_to_use].head()),No,5,41.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()",No,5,11.0 "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()",No,5,11.0 "target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,4,21.0 "preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000) out_df = pd.DataFrame(preds) out_df.columns = [""high"", ""medium"", ""low""] out_df[""listing_id""] = test_df.listing_id.values out_df.to_csv(""xgb_starter2.csv"", index=False)",Yes,3,48.0 "data = pd.read_csv('../input/train.csv', sep=',', low_memory=False) data['StateHoliday'] = data['StateHoliday'].apply(lambda x: str(x))",Yes,3,45.0 "grouppedByStoreDayPromo = data[data['Sales'] > 0].groupby(by=['Store', 'DayOfWeek', 'Promo'])",No,5,60.0 "test = pd.read_csv('../input/test.csv', sep=',')",No,5,45.0 "mn = data[data['Sales'] > 0].groupby(['Store', 'DayOfWeek', 'Promo'])['Sales'].mean().reset_index()",No,5,60.0 "res[['Id', 'Sales']].to_csv('result_mean.csv', sep=',', index=None)",No,5,25.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np import matplotlib.dates import datetime %matplotlib inline",No,5,23.0 "df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") df_train.head()",No,5,45.0 "closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values closed_store_ids",No,4,41.0 "df_train['Year'] = df_train['Date'].apply(lambda x: int(x[:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train['Day'] = df_train['Date'].apply(lambda x: int(x[8:]))",No,5,8.0 "fig, (axis1) = plt.subplots(1,1,figsize=(15,4)) sns.countplot(x = 'Open', hue = 'DayOfWeek', data = df_train,)",No,5,33.0 "for temp_year in range (2013,2016):
df_train1_temp = df_train[df_train.Year == temp_year]
average_daily_sales = df_train1_temp.groupby(\'Date\')[""Sales""].mean()
fig = plt.subplots(1,1,sharex=True,figsize=(25,8))
average_daily_sales.plot(title=""Average Daily Sales"")",No,5,33.0 "average_monthly_sales = df_train.groupby(\'Month\')[""Sales""].mean()
fig = plt.subplots(1,1,sharex=True,figsize=(10,5))
average_monthly_sales.plot(legend=True,marker=\'o\',title=""Average Sales"")",No,5,33.0 df_train.StateHoliday.unique(),No,5,57.0 "df_train['StateHoliday'] = df_train['StateHoliday'].replace(0, '0') df_train.StateHoliday.unique()",Yes,3,16.0 "sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = df_train, size = 5, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = df_train, size = 5, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 5, kind =""box"", palette =""muted"")",No,5,75.0 "sns.factorplot(x =""Year"", y =""Sales"", hue =""StateHoliday"", data = df_train, size = 6, kind =""bar"", palette =""muted"")",No,5,75.0 "sns.factorplot(x =""Month"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 6, kind =""bar"", palette =""muted"")",No,5,75.0 "sns.factorplot(x=""DayOfWeek"", y=""Customers"", hue=""HolidayBin"", col=""Promo"", data=df_train, capsize=.2, palette=""YlGnBu_d"", size=6, aspect=.75)",No,5,75.0 "sns.factorplot(x=""DayOfWeek"", y=""Customers"", hue=""SchoolHoliday"", col=""Promo"", data=df_train, capsize=.2, palette=""YlGnBu_d"", size=6, aspect=.75)",No,5,75.0 sns.distplot(df_train.Sales),No,5,33.0 df_store.head(),No,5,41.0 "total_sales_customers = df_train.groupby('Store')['Sales', 'Customers'].sum() total_sales_customers.head()",Yes,3,60.0 "df_total_sales_customers = pd.DataFrame({'Sales': total_sales_customers['Sales'], 'Customers': total_sales_customers['Customers']}, index = total_sales_customers.index) df_total_sales_customers = df_total_sales_customers.reset_index() df_total_sales_customers.head()",Yes,3,12.0 "avg_sales_customers = df_train.groupby('Store')['Sales', 'Customers'].mean() avg_sales_customers.head()",Yes,4,60.0 "df_avg_sales_customers = pd.DataFrame({'Sales': avg_sales_customers['Sales'], 'Customers': avg_sales_customers['Customers']}, index = avg_sales_customers.index) df_avg_sales_customers = df_avg_sales_customers.reset_index() df_stores_avg = df_avg_sales_customers.join(df_store.set_index('Store'), on='Store') df_stores_avg.head()",Yes,3,12.0 "df_stores_new = df_total_sales_customers.join(df_store.set_index('Store'), on='Store') df_stores_new.head()",Yes,4,32.0 "average_storetype = df_stores_new.groupby('StoreType')['Sales', 'Customers', 'CompetitionDistance'].mean() fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,4)) sns.barplot(average_storetype.index, average_storetype['Sales'], ax=axis1) sns.barplot(average_storetype.index, average_storetype['Customers'], ax=axis2) sns.barplot(average_storetype.index, average_storetype['CompetitionDistance'], ax=axis3) average_storetype.index",Yes,4,33.0 "average_assortment = df_stores_new.groupby('Assortment')['Sales', 'Customers'].mean() fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(average_assortment.index, average_assortment['Sales'], ax=axis1) sns.barplot(average_assortment.index, average_assortment['Customers'], ax=axis2)",Yes,4,33.0 "df_test[\'Year\'] = df_test[\'Date\'].apply(lambda x: int(x[:4]))
df_test[\'Month\'] = df_test[\'Date\'].apply(lambda x: int(x[5:7]))
df_test[\'Day\'] = df_test[\'Date\'].apply(lambda x: int(x[8:]))
df_test[""HolidayBin""] = df_test.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})
del df_test[\'Date\']
del df_test[\'StateHoliday\']
df_test.head()",Yes,3,16.0 df_train.head(),No,5,41.0 "from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV train_stores = dict(list(df_train.groupby('Store'))) test_stores = dict(list(df_test.groupby('Store')))",Yes,4,22.0 "best_list_max_depth = []
best_list_n_estimators = []

for i in test_stores:
store = train_stores[i]
X_train = store.drop([""Sales"", ""Store"", ""Customers""],axis=1)
Y_train = store[""Sales""]
X_test = test_stores[i].copy()


store_ids = X_test[""Id""]
X_test.drop([""Id"",""Store""], axis=1,inplace=True)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_train.mean())

estimator = RandomForestRegressor(random_state=123, criterion = \'mse\')
params = {\'n_estimators\': range(5, 20), \'max_depth\': range(5, 25)}
grid = GridSearchCV(estimator, params).fit(X_train, Y_train)
best_list_max_depth.append(grid.best_params_[\'max_depth\'])
best_list_n_estimators.append(grid.best_params_[\'n_estimators\'])
print (""score"", grid.best_score_)
print (""params"", grid.best_params_)",Yes,4,2.0 "res_max_depth = round(np.array(best_list_max_depth).mean()) res_n_estimators = round(np.array(best_list_n_estimators).mean())",No,5,2.0 "best_max_depth = round(np.array(best_list_max_depth).mean()) best_n_estimators = round(np.array(best_list_n_estimators).mean()) print(best_max_depth) print(best_n_estimators)",No,5,2.0 "import time import datetime import matplotlib.pyplot as plt %matplotlib inline ",No,5,23.0 "trainData = pd.read_csv('../input/train.csv', low_memory=False) trainData ",No,5,45.0 "supply = pd.read_csv('../input/store.csv') supply.set_index('Store', inplace=True) supply ",No,5,45.0 "trainData ",No,5,41.0 "#: 2 . customers, sales
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
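# Two-stage approach used below: one random forest predicts Customers from the
# base features, and a second random forest predicts Sales with Customers
# appended as an extra feature (actual values at train time, predicted values
# at test time).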
",No,5,22.0 "X_train, customers_train, sales_train = trainData.drop(['Sales', 'Customers'], axis=1).values, trainData['Customers'].values, trainData['Sales'].values ",No,5,21.0 "X1_train = np.concatenate((X_train, np.reshape(customers_train, (customers_train.shape[0], 1))), axis=1) ",No,5,11.0 "forest1 = RandomForestRegressor() forest2 = RandomForestRegressor() forest1.fit(X_train, customers_train) forest2.fit(X1_train, sales_train) ",Yes,3,7.0 #
testData = pd.read_csv('../input/test.csv')
,No,5,45.0 "testData['UnixTime'] = testData['Date'].map(toUnixTime) testData.drop('Date', axis=1, inplace=True) ",Yes,3,8.0 "testData.StateHoliday.replace({'a': 1, 'b': 2, 'c': 3, '0': 0}, inplace=True) unixTime = testData['UnixTime'].values testData['UnixTime'] = (unixTime - np.mean(unixTime))/np.std(unixTime) ",No,4,18.0 "testData ",No,5,41.0 "nanToMedian(testData) X_test = testData.drop('Id', axis=1).values ",Yes,3,10.0 "customers_test = forest1.predict(X_test) X1_test = np.concatenate((X_test, np.reshape(customers_test, (customers_test.shape[0], 1))), axis=1) sales_predicted = forest2.predict(X1_test) ",Yes,3,32.0 "Result = pd.DataFrame(testData.Id) Result['Sales'] = sales_predicted Result.set_index('Id') Result.to_csv('./result.csv', index=False) ",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "df_store = pd.read_csv('../input/store.csv', index_col=None)",No,5,45.0 df_store.info(),No,5,40.0 "df_train = pd.read_csv('../input/train.csv', index_col=None, low_memory=False)",No,5,45.0 "y_hat_R = R_model.predict(X_test) print(eval_rmspe(y_hat_R, y_mytest))",Yes,4,48.0 "print(df_train.shape) df_train.head()",Yes,3,58.0 "#,
from sklearn.ensemble import RandomForestRegressor",No,5,22.0 df_train.info(),No,5,40.0 "df_train.replace({'StateHoliday': {0: '0'}}, inplace=True) df_train.StateHoliday.unique()",Yes,3,57.0 df_train.DayOfWeek = df_train.DayOfWeek.astype(str) # for dictvectorizer,No,5,16.0 "#
X_train_full, y_train_full = df_data.drop('Sales', axis=1).values, df_data['Sales'].values",No,5,21.0 "X_test_full = df_test.drop('Id', axis=1).values",Yes,4,10.0 "# ,
forest = RandomForestRegressor(n_jobs=1, n_estimators=150, max_features=7, max_depth=100)
forest.fit(X_train_full, y_train_full)",Yes,4,4.0 y_hat_full = forest.predict(X_test_full),No,5,48.0 "submission = submission.append(pd.Series(y_hat_full, index=store_ids))
submission = pd.DataFrame({ ""Id"": submission.index, ""Sales"": submission.values})
submission.to_csv(\'submission.csv\', index=False)",Yes,3,12.0 "df = df_train[df_train.Open != 0].merge(df_store, on='Store').fillna(1) df.drop(['Store', 'Date', 'Customers'], axis=1, inplace=True)",Yes,3,17.0 "df.shape, df.columns",Yes,3,58.0 y_train = df.Sales.values,No,5,21.0 "print(X_train.shape, y_train.shape)",No,5,58.0 "from sklearn.ensemble import RandomForestRegressor rgr = RandomForestRegressor(n_estimators=25, verbose=True, n_jobs=8) rgr.fit(X_train, y_train) print(rgr.score(X_train, y_train))",Yes,3,4.0 "df_test = pd.read_csv('../input/test.csv', index_col=None)",No,5,45.0 "print(df_test.shape) df_test.head()",Yes,3,58.0 "features_to_use = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price""]",No,5,77.0 "df_test.replace({'StateHoliday': {0: '0'}}, inplace=True) df_test.StateHoliday.unique()",Yes,3,57.0 "df_pred = df_test[df_test.Open != 0].merge(df_store, on='Store').fillna(1) df_pred.drop(['Id', 'Store', 'Date'], axis=1, inplace=True)",Yes,3,17.0 df_pred.shape,No,5,58.0 X_test.shape,No,5,58.0 rgr.predict(X_test)[:10],No,5,48.0 "df_test.loc[df_test.Open != 0,'Sales'] = rgr.predict(X_test) df_test.loc[df_test.Open == 0, 'Sales'] = 0",No,5,48.0 "out = pd.DataFrame({
""Id"": df_test.Id,
""Sales"": df_test.Sales.values
})
out.to_csv(\'submission.csv\', index=False)",No,5,25.0 "from mlbox.preprocessing import * from mlbox.optimisation import * from mlbox.prediction import *",No,5,22.0 "paths = [""../input/train.json"", ""../input/test.json""] target_name = ""interest_level""",No,5,77.0 "rd = Reader() df = rd.train_test_split(paths, target_name)",No,5,13.0 "df[""train""].head()",No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import os

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

import numpy as np
import pandas as pd

os.environ[\'http_proxy\'] = \'\'
os.environ[\'https_proxy\'] = \'\'
os.environ[\'NO_PROXY\'] = \'localhost\'

def transform(df_):
""""""
transform Date to datetime type.
""""""
df_[\'Date\']=pd.to_datetime(df_[\'Date\'])
df_[\'month\']=df_[\'Date\'].dt.month
df_[\'year\']=df_[\'Date\'].dt.year
df_[\'Store\']=pd.to_numeric(df_[\'Store\'])
return df_

store=pd.read_csv(""../input/store.csv"")
train=pd.read_csv(""../input/train.csv"")
test=pd.read_csv(""../input/test.csv"")
# keep only rows with positive Sales
train=train[train[\'Sales\'] > 0]
# merge train and test with store
train=pd.merge(train, store, on=[\'Store\'])
test=pd.merge(test, store, on=[\'Store\'])
# transform
train=transform(train)
test=transform(test)
# add a logSales column to train, defined as log(Sales)
train[\'logSales\']=pd.to_numeric(np.log(train[\'Sales\']))
test[\'logSales\']=0
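# The forest is trained on logSales = log(Sales); predictions are mapped back to
# the sales scale with np.exp further down in this script.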

# initialization of h2o
h2o.init(nthreads=-1, max_mem_size = ""8G"")
train_hf = h2o.H2OFrame(train)
test_hf = h2o.H2OFrame(test)
rf_v1_model = H2ORandomForestEstimator(model_id=""rf_covType_v1"", ntrees=200, stopping_rounds=2, max_depth = 30, nbins_cats = 1115, score_each_iteration=True, seed=1000000)

# Prepare predictor/response columns and train the model
covtype_X=[col for col in train_hf.columns if col not in [""Id"",""Date"",""Sales"",""logSales"",""Customers""]]
covtype_y=train_hf.columns[-1]
rf_v1_model.train(x=covtype_X, y=covtype_y, training_frame=train_hf)
test_result_hf = rf_v1_model.predict(test_hf)
test_result_df = test_result_hf.as_data_frame()
test_result_df[\'predict\']=np.exp(test_result_df[\'predict\'])
test_result_df.rename(columns={\'predict\': \'Sales\'}, inplace=True)
test_result_df.insert(loc=0, column=\'Id\', value=test[\'Id\'])
test_result_df.set_index(\'Id\')
test_result_df.to_csv(\'python_h2o_rf.csv\', header=True, index=False)",Yes,4,22.0 "import numpy as np import pandas as pd",No,5,22.0 "brats = pd.read_csv(""../input/child_wishlist_v2.csv"", header=None, index_col=0).as_matrix() gifts = pd.read_csv(""../input/gift_goodkids_v2.csv"", header=None, index_col=0).as_matrix()",No,5,45.0 "TRIPS_COUNT = 5_001 TWINS_COUNT = 40_000 TWINS_START = TRIPS_COUNT TWINS_END = TWINS_START + TWINS_COUNT GIFTS_LIMIT = 1_000 BRAT_PREF_COUNT = brats.shape[1] GIFT_PREF_COUNT = gifts.shape[1]",No,5,77.0 "submit = pd.read_csv(""save/""+target_name+""_predictions.csv"")[[""high"", ""medium"", ""low""]] submit[""listing_id""] = df[""test""].listing_id.astype(int).values submit.to_csv(""mlbox.csv"", index=False)",Yes,3,45.0 "print(""Quantity of Gifts:"") pd.Series(gift_cnt).value_counts()",No,5,72.0 "import os, operator, math import pandas as pd import numpy as np import datetime as dt from tqdm import tqdm import matplotlib.pyplot as plt from scipy.optimize import linear_sum_assignment from collections import defaultdict, Counter",Yes,3,22.0 "scores = pd.DataFrame(rows) scores.head()",Yes,4,12.0 "submission = pd.DataFrame({ ""ChildId"": range(len(brats)), ""GiftId"": assigned })",No,5,12.0 "child_data = pd.read_csv('../input/santa-gift-matching/child_wishlist_v2.csv', header=None).drop(0, 1).values gift_data = pd.read_csv('../input/santa-gift-matching/gift_goodkids_v2.csv', header=None).drop(0, 1).values n_children = 1000000 n_gift_type = 1000 n_gift_quantity = 1000 n_child_wish = 100 triplets = 5001 twins = 40000 tts = triplets + twins ",Yes,3,77.0 "submission.to_csv(""greedy_v2.csv"", index=False)",No,5,25.0 "import os, cv2, re, random import numpy as np import pandas as pd from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing.image import img_to_array, load_img from keras import layers, models, optimizers from keras import backend as K from sklearn.model_selection import train_test_split",No,5,22.0 "initial_sub = '../input/max-flow-with-min-cost-v2-0-9267/subm_0.926447635166.csv' subm = pd.read_csv(initial_sub) subm['gift_rank'] = subm.groupby('GiftId').rank() - 1 subm['gift_id'] = subm['GiftId'] * 1000 + subm['gift_rank'] subm['gift_id'] = subm['gift_id'].astype(np.int64) current_gift_ids = subm['gift_id'].values",Yes,3,8.0 "import os, cv2, itertools import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "train_images_dogs_cats.sort(key=natural_keys) train_images_dogs_cats = train_images_dogs_cats[0:1300] + train_images_dogs_cats[12500:13800] test_images_dogs_cats.sort(key=natural_keys)",Yes,4,9.0 "def prepare_data(list_of_images):
""""""
Returns two arrays:
x is an array of resized images
y is an array of labels
""""""
x = [] # images as arrays
y = [] # labels

for image in list_of_images:
x.append(cv2.resize(cv2.imread(image), (img_width,img_height), interpolation=cv2.INTER_CUBIC))

for i in list_of_images:
if \'dog\' in i:
y.append(1)
elif \'cat\' in i:
y.append(0)
#else:
#print(\'neither cat nor dog name present in images\')

return x, y",No,5,21.0 "from sklearn.linear_model import LogisticRegressionCV from sklearn.metrics import confusion_matrix clf = LogisticRegressionCV() X_train_lr, y_train_lr = X_train.T, y_train.T.ravel() clf.fit(X_train_lr, y_train_lr)",Yes,3,22.0 "X, Y = prepare_data(train_images_dogs_cats) print(K.image_data_format())",Yes,4,21.0 "# First split the data in two sets, 80% for training, 20% for Val/Test) X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=1)",No,5,13.0 "submission = pd.DataFrame(np.hstack([test_idx, clf.predict_proba(X_test_lr)]), columns=['id', 'cat', 'dog'])",No,5,12.0 "nb_train_samples = len(X_train) nb_validation_samples = len(X_val) batch_size = 16",No,5,77.0 "model = models.Sequential() model.add(layers.Conv2D(32, (3, 3), input_shape=(img_width, img_height, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(32, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(64, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Flatten()) model.add(layers.Dense(64)) model.add(layers.Activation('relu')) model.add(layers.Dropout(0.5)) model.add(layers.Dense(1)) model.add(layers.Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy']) model.summary()",No,5,4.0 "submission = submission.drop([\'cat\'], axis=1)
submission = submission.rename(index=str, columns={""dog"": ""label""})
submission[\'id\'] = submission[\'id\'].astype(int)
submission.sort_values(\'id\', inplace=True)",Yes,3,61.0 submission.head(),No,5,41.0 "submission.to_csv('STahamtan_Dog_vs_Cat_Submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_json(""../input/train.json"") test = pd.read_json(""../input/test.json"")",No,5,44.0 test.info(),No,5,40.0 "history = model.fit_generator( train_generator, steps_per_epoch=nb_train_samples // batch_size, epochs=30, validation_data=validation_generator, validation_steps=nb_validation_samples // batch_size ) ",No,4,7.0 "model.save_weights('model_wieghts.h5') model.save('model_keras.h5')",Yes,4,50.0 train['interest_level'].value_counts(),No,5,72.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.set(xlim=(0,2))
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.set(xlim=(2,6))
facet.add_legend()
plt.show()",No,5,33.0 "for dataset in train_test: dataset.loc[ dataset['bathrooms'] <= 2, 'bathrooms'] = 2, dataset.loc[(dataset['bathrooms'] > 2) & (dataset['bathrooms'] <= 4), 'bathrooms'] = 1, dataset.loc[ dataset['bathrooms'] > 4, 'bathrooms'] = 0 ",No,5,8.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bedrooms\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "for dataset in train_test: dataset.loc[ dataset['bedrooms'] <= 2, 'bedrooms'] = 0, dataset.loc[(dataset['bedrooms'] > 2) & (dataset['bedrooms'] <= 4), 'bedrooms'] = 1, dataset.loc[ dataset['bedrooms'] > 4, 'bedrooms'] = 2",No,5,8.0 "counter = range(1, len(test_images_dogs_cats) + 1)
solution = pd.DataFrame({""id"": counter, ""label"":list(prediction_probabilities)})
cols = [\'label\']

for col in cols:
solution[col] = solution[col].map(lambda x: str(x).lstrip(\'[\').rstrip(\']\')).astype(float)

solution.to_csv(""dogsVScats.csv"", index = False)",Yes,3,12.0 "import pandas as pd import numpy as np from sklearn import ensemble, preprocessing",No,5,22.0 "# Load dataset train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') sample = pd.read_csv('../input/sampleSubmission.csv') weather = pd.read_csv('../input/weather.csv')",No,5,45.0 sum(train['building_id']=='0'),No,5,72.0 "# Not using codesum for this benchmark weather = weather.drop('CodeSum', axis=1)",No,5,10.0 "train[""created""] = pd.to_datetime(train[""created""]) train[""month_created""] = train[""created""].dt.month",Yes,3,16.0 "train[""month_created""]",No,5,41.0 train['month_created'].value_counts(),No,5,72.0 "# Split station 1 and 2 and join horizontally weather_stn1 = weather[weather['Station']==1] weather_stn2 = weather[weather['Station']==2] weather_stn1 = weather_stn1.drop('Station', axis=1) weather_stn2 = weather_stn2.drop('Station', axis=1) weather = weather_stn1.merge(weather_stn2, on='Date')",Yes,3,14.0 "def bar_chart(feature):
low = train[train['interest_level']=='low'][feature].value_counts()
medium = train[train['interest_level']=='medium'][feature].value_counts()
high = train[train['interest_level']=='high'][feature].value_counts()
df = pd.DataFrame([low, medium, high])
df.index = ['low','medium','high']
df.plot(kind='bar',stacked=True, figsize=(10,5))",Yes,3,33.0 "train[""created""] = pd.to_datetime(train[""created""])
train[""date_created""] = train[""created""].dt.date
cnt_srs = train[\'date_created\'].value_counts()

plt.figure(figsize=(12,4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index, cnt_srs.values, alpha=0.8)
ax.xaxis_date()
plt.xticks(rotation=\'vertical\')
plt.show()",Yes,4,33.0 "train['month'] = train.Date.apply(create_month) train['day'] = train.Date.apply(create_day) test['month'] = test.Date.apply(create_month) test['day'] = test.Date.apply(create_day)",No,5,8.0 "# Add integer latitude/longitude columns train['Lat_int'] = train.Latitude.apply(int) train['Long_int'] = train.Longitude.apply(int) test['Lat_int'] = test.Latitude.apply(int) test['Long_int'] = test.Longitude.apply(int)",No,5,16.0 train['day_of_week'] = train['created'].dt.weekday,No,5,8.0 "# drop address columns train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1) test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)",No,5,10.0 "test[""created""] = pd.to_datetime(test[""created""])
test[\'day_of_week\'] = test[\'created\'].dt.weekday",Yes,3,8.0 train['created_day'] = train['created'].dt.day,No,5,8.0 test['created_day'] = test['created'].dt.day,No,5,8.0 "### Iterest per Day of Week
fig = plt.figure(figsize=(12,6))
sns.countplot(x=""created_day"", hue=""interest_level"", hue_order=[\'low\', \'medium\', \'high\'], data=train);
plt.xlabel(\'created_day\');
plt.ylabel(\'Number of occurrences\');",No,5,75.0 "# Merge with weather data train = train.merge(weather, on='Date') test = test.merge(weather, on='Date') train = train.drop(['Date'], axis = 1) test = test.drop(['Date'], axis = 1)",Yes,3,32.0 "# Convert categorical data to numbers lbl = preprocessing.LabelEncoder() lbl.fit(list(train['Species'].values) + list(test['Species'].values)) train['Species'] = lbl.transform(train['Species'].values) test['Species'] = lbl.transform(test['Species'].values)",No,5,20.0 "train[""num_features""] = train[""features""].apply(len) test[""num_features""] = test[""features""].apply(len)",No,5,8.0 "llimit = np.percentile(train.latitude.values, 1) ulimit = np.percentile(train.latitude.values, 99) train['latitude'].ix[train['latitude']ulimit] = ulimit plt.figure(figsize=(8,6)) sns.distplot(train.latitude.values, bins=50, kde=False) plt.xlabel('latitude', fontsize=12) plt.show()",Yes,3,33.0 "llimit = np.percentile(train.longitude.values, 1) ulimit = np.percentile(train.longitude.values, 99) train['longitude'].ix[train['longitude']ulimit] = ulimit plt.figure(figsize=(8,6)) sns.distplot(train.longitude.values, bins=50, kde=False) plt.xlabel('longitude', fontsize=12) plt.show()",Yes,3,33.0 "lbl.fit(list(train['Street'].values) + list(test['Street'].values)) train['Street'] = lbl.transform(train['Street'].values) test['Street'] = lbl.transform(test['Street'].values)",No,5,20.0 train['price'],No,5,41.0 "lbl.fit(list(train['Trap'].values) + list(test['Trap'].values)) train['Trap'] = lbl.transform(train['Trap'].values) test['Trap'] = lbl.transform(test['Trap'].values)",No,5,20.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(0,100000))
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(100000,200000))
facet.add_legend()
plt.show()",No,5,33.0 "# drop columns with -1s train = train.ix[:,(train != -1).any(axis=0)] test = test.ix[:,(test != -1).any(axis=0)]",No,4,10.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(0,10000))
facet.add_legend()
plt.show()",No,5,33.0 "# Random Forest Classifier clf = ensemble.RandomForestClassifier(n_jobs=1, n_estimators=1000, min_samples_split= 2) clf.fit(train, labels)",No,5,7.0 "features_drop = ['building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos', 'street_address', 'month_created', 'date_created'] train1 = train.drop(features_drop, axis=1)",No,5,10.0 "features_drop = ['building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos', 'street_address'] test1 = test.drop(features_drop, axis=1)",No,5,10.0 "X = train[['bathrooms','bedrooms','latitude','longitude','price','day_of_week','created_day','num_features']] y = train1['interest_level']",No,5,21.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)",Yes,4,22.0 "X = test[['bathrooms','bedrooms','latitude','longitude','price','day_of_week','created_day','num_features']] y = clf.predict_proba(X)",Yes,3,21.0 "t = 732 m=8 n=8",No,5,77.0 "df = pd.read_csv('../input/data.txt',skiprows=2,sep=' ',names=list(map(str,(list(range(n))))))",No,5,45.0 "sub = pd.DataFrame() sub[""listing_id""] = test[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",Yes,3,45.0 "from sklearn.feature_extraction.text import TfidfVectorizer from nltk.corpus import stopwords from nltk import word_tokenize import pandas as pd import numpy as np import re from sklearn.model_selection import train_test_split from collections import defaultdict from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import LabelEncoder from sklearn.metrics import accuracy_score, classification_report import json",No,5,22.0 "with open('../input/train.json') as f: raw_train = json.load(f) with open('../input/test.json') as f: raw_test = json.load(f)",No,5,44.0 softmax = LogisticRegression(**m_params),No,5,4.0 "test_x = [j for i in sorted(val.keys()) for j in val[i]] true = [i for i in sorted(val.keys()) for j in val[i]]",No,5,53.0 pred = softmax.predict(tfidf.transform(test_x)),No,5,48.0 "accuracy_score(true, pred)",No,5,49.0 "lab = LabelEncoder() c_true = lab.fit_transform(true) c_pred = lab.transform(pred) print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))",No,4,20.0 "sub_df.to_csv('softmax_reg.csv', index=False)",No,5,25.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import warnings % matplotlib inline from datetime import datetime from scipy import stats pd.options.mode.chained_assignment = None from scipy.stats import norm, skew warnings.filterwarnings(""ignore"", category=DeprecationWarning) from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings",No,5,23.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 test.head(),No,5,41.0 train.describe(),No,5,40.0 "print('Train Dataset Shape : {0}'.format(train.shape)) print('Test Dataset Shape : {0}'.format(test.shape))",No,5,58.0 train.dtypes,No,5,70.0 sns.boxplot(train['count']),No,5,33.0 new_df = pd.DataFrame(new_df),No,5,12.0 "train = train[np.abs(train[""count""]-train[""count""].mean())<=(3*train[""count""].std())] ",Yes,4,14.0 
"fig,ax = plt.subplots(2,1,figsize = (10,10))
sns.distplot(train[\'count\'],ax=ax[0])
stats.probplot(train[""count""], dist=\'norm\', fit=True, plot=ax[1])
print(\'Skewness : {0}\'.format(train[\'count\'].skew()))
print(\'Kurt : {0}\'.format(train[\'count\'].kurt()))",No,5,33.0 new_df['m'] = new_df['t'].apply(lambda x : int(x)%8),No,5,8.0 "fig,ax = plt.subplots(2,1,figsize = (10,10))
#logcount = np.log1p(train[\'count\']).kurt()
#rootcount = np.sqrt(train[\'count\']).kurt()
#cubiccount = np.power(train[\'count\'],2).kurt()
#minVal = min([logcount, rootcount, cubiccount])
#if logcount == minVal:
best = \'log\'
train[\'count_log\'] = np.log1p(train[\'count\'])
sns.distplot(train[\'count_log\'],ax=ax[0])
stats.probplot(train[""count_log""], dist=\'norm\', fit=True, plot=ax[1])
#elif rootcount == minVal:
#best = \'root\'
#train[\'count_root\'] = np.sqrt(train[\'count\'])
#sns.distplot(train[\'count_root\'],ax=ax[0])
#stats.probplot(train[""count_root""], dist=\'norm\', fit=True, plot=ax[1])
#elif cubiccount == minVal:
#best = \'cubic\'
#train[\'count_cubic\'] = np.power(train[\'count\'],2)
#sns.distplot(train[\'count_cubic\'],ax=ax[0])
#stats.probplot(train[""count_cubic""], dist=\'norm\', fit=True, plot=ax[1])
#print(\'For count, the Best TF is \' + best)",No,5,33.0 new_df['t'] = new_df['t'].apply(lambda x:x//8),No,5,8.0 new_df['hour'] = new_df['t'].apply(lambda x : x%24 ),No,5,8.0 "train['date'] = train.datetime.apply(lambda x: x.split()[0]) train['hour'] = train.datetime.apply(lambda x: x.split()[1].split(':')[0]) train['weekday'] = train.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) train['month'] = train.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) train = train.drop('datetime',axis=1)",No,4,8.0 new_df['day'] = new_df['t'].apply(lambda x : x//24 ),No,5,8.0 train.shape,No,5,58.0 "categorical = [\'date\',\'weekday\',\'month\',\'hour\',\'season\',\'holiday\',\'workingday\',\'weather\']
numeric = [""temp"",""atemp"",""casual"",""registered"",""humidity"",""windspeed"",""count"",""count_log""]",No,5,77.0 "new_df['mm']=new_df['m'] new_df['nn']= new_df['n']",No,5,8.0 new_df['id'] = new_df['t'].map(str)+':'+new_df['mm'].map(str)+':'+new_df['nn'].map(str),No,5,8.0 "for idx in categorical: train[idx].astype('category')",Yes,4,16.0 "fig,axes = plt.subplots(ncols=2 ,nrows=2) fig.set_size_inches(15,10) sns.boxplot(data=train,x='season',y='count',ax=axes[0][0]) sns.boxplot(data=train,x='holiday',y='count',ax=axes[0][1]) sns.boxplot(data=train,x='workingday',y='count',ax=axes[1][0]) sns.boxplot(data=train,x='weather',y='count',ax=axes[1][1]) fig1,axes1 = plt.subplots() fig1.set_size_inches(15,10) sns.boxplot(data=train,x='hour',y='count')",No,5,33.0 "plt.subplots(figsize=(15,8)) sns.heatmap(train[numeric].corr(),annot=True)",No,5,80.0 "corr = train[numeric].drop(\'count\', axis=1).corr()
corr = corr.drop(\'count_log\', axis=1).drop(\'count_log\', axis=0) # drop the log target as well; we only want feature-to-feature correlations
plt.figure(figsize=(12, 10))
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
cmap=\'viridis\', vmax=1.0, vmin=-1.0, linewidths=0.1,
annot=True, annot_kws={""size"": 8}, square=True);",No,5,80.0 new_df = df_r.set_index('id').join(new_df.set_index('id')),No,4,32.0 "### count,month plt.figure(figsize=(15,8)) monthagg = pd.DataFrame(train.groupby('month')['count'].mean()).reset_index() sns.barplot(data=monthagg, x='month',y='count').set(title = 'Month Vs Count')",No,4,33.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "### count,season,hour plt.figure(figsize=(15,8)) houragg = pd.DataFrame(train.groupby(['hour','season'])['count'].mean()).reset_index() sns.pointplot(data=houragg,x=houragg['hour'],y=houragg['count'],hue=houragg['season']).set(title='Hour,Season Vs Count')",Yes,4,33.0 "scaler.transform(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "### count,hour,weekday plt.figure(figsize=(15,8)) hourweekagg = pd.DataFrame(train.groupby(['hour','weekday'])['count'].mean()).reset_index() sns.pointplot(data=hourweekagg,x=hourweekagg['hour'],y=hourweekagg['count'],hue=hourweekagg['weekday']).set(title='Hour,Week Vs Count')",Yes,4,33.0 "from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.kernel_ridge import KernelRidge from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone from sklearn.model_selection import KFold, cross_val_score, train_test_split from sklearn.metrics import mean_squared_error import xgboost as xgb import lightgbm as lgb",No,5,22.0 "new_df[['m1','m2','m3','m4','n','m','hour','day']]=scaler.transform(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "target = train['count'] target_log=train['count_log'] train = train.drop('count_log',axis=1) train = train.drop('count',axis=1) train = train.drop('atemp',axis=1) train = train.drop('date',axis=1) train = train.drop('casual',axis=1) train = train.drop('registered',axis=1) m_dum = pd.get_dummies(train['month'],prefix='m') ho_dum = pd.get_dummies(train['hour'],prefix='ho') s_dum = pd.get_dummies(train['season'],prefix='s') we_dum = pd.get_dummies(train['weather'],prefix='we') train = pd.concat([train,s_dum,we_dum,m_dum,ho_dum],axis=1) testid = test['datetime'] test['date'] = test.datetime.apply(lambda x: x.split()[0]) test['hour'] = test.datetime.apply(lambda x: x.split()[1].split(':')[0]) test['weekday'] = test.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) test['month'] = test.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) test = test.drop('datetime',axis=1) test = test.drop('atemp',axis=1) test = test.drop('date',axis=1) s_dum = pd.get_dummies(test['season'],prefix='s') we_dum = pd.get_dummies(test['weather'],prefix='we') m_dum = pd.get_dummies(test['month'],prefix='m') ho_dum = pd.get_dummies(test['hour'],prefix='ho') test= pd.concat([test,s_dum,we_dum,m_dum,ho_dum],axis=1)",Yes,3,10.0 train = new_df[new_df['val'] !=-1],No,5,14.0 "X_train = train.drop(['val','t','mm','nn'], axis=1) y_train = train['val'].values",No,5,21.0 test = new_df[new_df['val'] == -1],No,5,14.0 "X_test = test.drop(['val','t','mm','nn'], axis=1)",No,5,10.0 "gbr = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01, max_depth=4).fit(train.values, target_log)",No,5,7.0 "import keras from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense 
from keras.optimizers import adam",No,5,22.0 "model = Sequential() model.add(Dense(128, activation='relu', input_shape=(8,))) model.add(Dense(256, activation='relu')) model.add(Dense(256, activation='relu')) model.add(Dense(1)) model.summary()",No,5,84.0 "model_gbr = GradientBoostingRegressor(n_estimators=1500,max_depth=5,learning_rate=0.01).fit(train.values,target_log)",No,5,7.0 "prediction = model_gbr.predict(test.values) prediction = np.expm1(prediction)",Yes,4,48.0 "output = pd.DataFrame() output['datetime'] = testid output['count'] = prediction output.to_csv('output.csv',index=False)",Yes,3,25.0 "import glob import os, sys import random from tqdm import tqdm import numpy as np from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array from keras.models import Sequential from keras.layers import Dropout, Flatten, Dense from keras import applications import seaborn as sns import pandas as pd import matplotlib.pyplot as plt from keras import backend as K K.tensorflow_backend._get_available_gpus()",No,5,23.0 "train_data_dir = '../input/dogs-vs-cats-redux-kernels-edition/train' test_data_dir = '../input/dogs-vs-cats-redux-kernels-edition/test' # Make sure you include https://www.kaggle.com/keras/vgg16/data as your data source vgg_model_path = '../input/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5' epochs = 20 batch_size = 20 img_width, img_height = 150, 150 training_n_bound = 5000 # set to None to use the entire training dataset; it took about 2 hours at my Macbook Pro.",No,5,77.0 "# Wrap training data into pandas' DataFrame. lst = list(gen_image_label(train_data_dir)) random.shuffle(lst) if training_n_bound is not None: lst = lst[:training_n_bound] train_df = pd.DataFrame(lst, columns=['label', 'id', 'filename']) train_df = train_df.sort_values(by=['label', 'id']) train_df.head(3)",Yes,3,12.0 "train_df['label_code'] = train_df.label.map({'cat':0, 'dog':1}) train_df.head(3)",Yes,3,41.0 "# Wrap testing data into pandas' DataFrame. 
lst = list(gen_image_label(test_data_dir)) test_df = pd.DataFrame(lst, columns=['label', 'id', 'filename']) test_df = test_df.sort_values(by=['label', 'id']) test_df['label_code'] = test_df.label.map({'cat':0, 'dog':1}) test_df.head(3)",Yes,3,12.0 "sns.countplot(train_df.label) plt.title('Number of training images per category')",No,5,33.0 "model.fit(train_embeddings[train_indices,:], train_labels, epochs=epochs, batch_size=batch_size, validation_data=(train_embeddings[validate_indices,:], validation_labels)) model.save_weights(embedding_fc_model)",Yes,3,7.0 "from sklearn.metrics import f1_score, accuracy_score pred_validation = model.predict(train_embeddings[validate_indices,:]) f1 = f1_score(validation_labels, pred_validation > 0.5) acc = accuracy_score(validation_labels, pred_validation > 0.5) (f1, acc)",Yes,4,49.0 "pred_test = model.predict(test_embeddings) pred_test.shape",Yes,3,48.0 "results = pd.DataFrame({'id': pd.Series(test_df.id.values[:pred_test.shape[0]]), 'label': pd.Series(pred_test.T[0])}) results.to_csv('submission.csv', index=False) results.head(10)",Yes,4,12.0 "df = pd.DataFrame(np.concatenate((X, train_labels[:train_embeddings[train_indices,:].shape[0]].reshape(train_embeddings[train_indices,:].shape[0],1)), axis=1), columns=['X', 'Y', 'Z', 'label'])",No,5,12.0 "g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""X"", ""Y"", alpha=.5) g.add_legend(); g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""Y"", ""Z"", alpha=.5) g.add_legend(); g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""X"", ""Z"", alpha=.5) g.add_legend();",No,5,33.0 "from nltk import word_tokenize from nltk.corpus import stopwords import pandas as pd import numpy as np import re from sklearn.metrics import accuracy_score, classification_report from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.pipeline import Pipeline from scipy import sparse from sklearn.svm import LinearSVC import json from sklearn.model_selection import train_test_split import copy",No,5,22.0 "train, val = train_test_split(raw_train, test_size=0.2, random_state=2018)",No,5,13.0 "pred = [] test_pred = [] for i in range(3): p.get_params()['lr'].class_weight = {0: 1, 1:w[i] } p.fit(train_x, train_y[i]) pred.append(p.decision_function(val_x)) test_pred.append(p.decision_function(test_x))",Yes,3,7.0 "accuracy_score(val_y, np.argmax(np.array(pred), axis=0))",No,5,49.0 "sub_df = pd.DataFrame() sub_df['id'] = [i['id'] for i in raw_test] sub_df['sentiment'] = np.argmax(np.array(test_pred), axis=0) sub_df['sentiment']= sub_df['sentiment'].apply(lambda x: lab.classes_[x])",Yes,4,8.0 "sub_df.to_csv('nb.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from os import listdir
from os.path import join, basename
from PIL import Image
print(listdir(""../input""))
print(listdir("".""))
IMG_HEIGHT = 50
IMG_WIDTH = 50
NUM_CHANNELS = 3

from threading import current_thread, Thread, Lock
from multiprocessing import Queue
# Any results you write to the current directory are saved as output.",Yes,4,22.0 "# initializations related to threading stuff batch_size = 500 num_train_images = 25000 num_test_images = 12500 num_train_threads = int(num_train_images/batch_size) # 50 num_test_threads = int(num_test_images/batch_size) # 25 lock = Lock()",No,4,77.0 "print(train_x.shape) print(len(train_y))",No,5,58.0 "test_x =get_testing_data() print(test_x.shape)",Yes,4,58.0 "print(""train_x shape"",train_x.shape) print(""test_x shape"", test_x.shape) # convert train_y to np. array train_y = np.array(train_y) print(""train_y.shape"", train_y.shape)",No,5,58.0 "# mean normalize train and test images train_x = train_x/255 test_x = test_x/255",No,5,8.0 "# import required packages from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization from keras.layers import Conv2D, MaxPooling2D from keras.utils import np_utils, to_categorical from sklearn.model_selection import train_test_split",No,5,22.0 "from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
tfidf = CountVectorizer(stop_words=\'english\', max_features=200)
tfidf.fit(list(train_df[\'features\']) + list(test_df[\'features\']))
tr_sparse = tfidf.transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])
print(te_sparse)
",No,4,8.0 "print(train_df[features_to_use].head()) ",No,5,41.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() ",No,4,32.0 "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() ",No,4,32.0 "target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape) ",No,4,20.0 "preds, model = runXGB(train_X, train_y, test_X, num_rounds=818) out_df = pd.DataFrame(preds) out_df.columns = [""high"", ""medium"", ""low""] out_df[""listing_id""] = test_df.listing_id.values out_df.to_csv(""xgb_starter2.csv"", index=False) ",No,4,55.0 "%matplotlib inline import matplotlib.pyplot as plt from tqdm import tqdm_notebook import numpy as np import pandas as pd import torch import torchvision import torchvision.transforms as transforms import torch.nn as nn import torch.nn.functional as F",No,5,23.0 torch.__version__,No,4,23.0 from torch.autograd import Variable,No,5,22.0 "use_gpu = torch.cuda.is_available() use_gpu",No,4,77.0 !ls ../input/cifar10-python/,No,5,88.0 !tar -zxvf ../input/cifar10-python/cifar-10-python.tar.gz,No,5,73.0 "model.fit(X_train, y_train, batch_size=128, epochs=240, verbose=1,validation_split=0.2)",No,5,7.0 "from xgboost import XGBRegressor model_XGB = XGBRegressor() model_XGB.fit(X_train,y_train)",Yes,4,4.0 predict = model.predict(X_test),No,5,48.0 X_test['demand'] = X_test['pred'].apply(make_positive),No,5,8.0 "X_test[['id','demand']].to_csv('result.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd import random import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import BaggingClassifier from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.neighbors import NearestCentroid, RadiusNeighborsClassifier, KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from scipy import stats from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import normalize from sklearn.utils import shuffle from sklearn import decomposition, cross_decomposition # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # number of attributes for pca decompose_to = 5",No,5,88.0 " # show data on 2d plots positives = train_data_pca[train_labels[:]==1] negatives = train_data_pca[train_labels[:]==0] positivest = test_data_pca[test_predicted[:]==1] negativest = test_data_pca[test_predicted[:]==0] for j in range(decompose_to): plt.figure(figsize=(20,15)) for i in range(decompose_to): plt.subplot(4,5,i+1) axis = [j,i] a=positives[:,axis] plt.scatter(*zip(*a), color='r') a=negatives[:,axis] plt.scatter(*zip(*a), color='b') plt.title(str(axis)) plt.show() plt.figure(figsize=(20,15)) for i in range(decompose_to): plt.subplot(4,5,i+1) axis = [j,i] a=positivest[:,axis] plt.scatter(*zip(*a), color='g') a=negativest[:,axis] plt.scatter(*zip(*a), color='c') plt.title(str(axis)) plt.show()",No,5,33.0 "# PREDICTING THE EFFECTS OF GENETIC VARIATIONS USING LGBM # BY - OMKAR SABNIS - 29-05-2018 #Importing library import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import re import nltk from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.preprocessing import StandardScaler,OneHotEncoder from sklearn.preprocessing import LabelEncoder from sklearn.metrics import confusion_matrix,mean_squared_error from sklearn.model_selection import KFold, cross_val_score,train_test_split from sklearn.naive_bayes import GaussianNB,MultinomialNB from sklearn.ensemble import RandomForestClassifier import lightgbm as lgb",No,5,22.0 "# READING THE DATASETS
train = pd.read_csv(""../input/training_variants"")
trainx = pd.read_csv(\'../input/training_text\',sep = \'\\|\\|\', engine= \'python\', header=None,
skiprows=1, names=[""ID"",""Text""])
train = pd.merge(train, trainx, how = \'left\', on = \'ID\').fillna(\'\')
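# training_text stores each row as ID||Text, which is why the read_csv above uses a regex
# separator with the python engine; merging on ID then attaches the clinical text to its
# Gene/Variation/Class row, with missing texts falling back to an empty string.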
train.head()",Yes,3,45.0 "test = pd.read_csv(""../input/stage2_test_variants.csv"")
testx = pd.read_csv(\'../input/stage2_test_text.csv\',sep = \'\\|\\|\', engine= \'python\', header=None,
skiprows=1, names=[""ID"",""Text""])
test = pd.merge(test, testx, how = \'left\', on = \'ID\').fillna(\'\')
test.head()",Yes,3,45.0 "def textlen(train): k = train['Text'].apply(lambda x: len(str(x).split())) l = train['Text'].apply(lambda x: len(str(x))) return k, l train['Text_no_word'], train['Text_no_char'] = textlen(train) test['Text_no_word'], test['Text_no_char'] = textlen(test)",No,5,8.0 "tfidf = TfidfVectorizer(
    min_df=1, max_features=1600, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\\w+', ngram_range=(1, 3), use_idf=True,
    smooth_idf=True, sublinear_tf=True, stop_words='english')
X_train = tfidf.fit_transform(train['Text']).toarray()
print(X_train)
X_test = tfidf.transform(test['Text']).toarray()
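# The test text is only transformed, not re-fit, so X_test shares the vocabulary learned from
# the training text and its 1600 columns line up with X_train.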

def encoding(df,col):
le = LabelEncoder()
for i in col:
df[i] = le.fit_transform(df[i])
train.columns
col = ['Gene', 'Variation', 'Class']
encoding(train,col)
encoding(test,['Gene', 'Variation'])

X_train = pd.DataFrame(X_train)
X_train = X_train.join(train[['Gene', 'Variation', 'Text_no_word','Text_no_char']])
X_test = pd.DataFrame(X_test)
X_test = X_test.join(test[['Gene', 'Variation', 'Text_no_word','Text_no_char']])",Yes,3,8.0 "# FEATURE SCALING sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) y_train = train['Class']",No,5,21.0 "xtr,xvl,ytr,yvl = train_test_split(X_train,y_train,test_size=0.3,random_state=10)",No,5,13.0 predictions = [0 if i < 0 else i for i in predictions],No,1,53.0 "output = pd.DataFrame({'datetime': test_data.index, 'count': predictions}) output.to_csv('submission.csv', index=False)",Yes,3,55.0 "# SUBMISSION OF FILE IN CSV FORMAT: submit = pd.DataFrame(test.ID) submit = submit.join(pd.DataFrame(pred_test)) submit.columns = ['ID', 'class1','class2','class3','class4','class5','class6','class7','class8','class9'] submit.to_csv('submission.csv', index=False) ",Yes,3,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer from scipy.sparse import hstack from sklearn.model_selection import train_test_split import time import lightgbm as lgb import math from sklearn.metrics import mean_squared_error,mean_absolute_error,log_loss import sklearn from sklearn.ensemble import RandomForestClassifier import itertools import xgboost as xgb import random import datetime from wordcloud import WordCloud import re import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go from plotly import tools from plotly.offline import download_plotlyjs, init_notebook_mode, iplot pd.options.mode.chained_assignment = None # default='warn' %matplotlib inline",No,5,23.0 "train = pd.read_json('../input/train.json') test = pd.read_json('../input/test.json')",No,5,44.0 "merge.loc[merge['bathrooms'] > 7 , 'bathrooms'] = 7 merge['rooms'] = merge['bathrooms'] + merge['bedrooms'] merge['rooms_diff'] = merge['bathrooms'] - merge['bedrooms'] merge['half_bathrooms'] = ((merge['rooms'] - np.floor(merge['rooms'])) > 0).astype(int) features_to_use = np.concatenate([features_to_use, ['bathrooms', 'bedrooms', 'rooms', 'rooms_diff', 'half_bathrooms']]) features_to_use = np.unique(features_to_use) features_to_use",No,4,8.0 "merge.loc[merge[\'latitude\'] < 1, \'latitude\'] = merge[\'latitude\'].mode()[0]
merge.loc[merge[\'longitude\']>-1, \'longitude\'] = merge[\'longitude\'].mode()[0]
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))

merge[\'latitude\'] = scaler.fit_transform(np.array(merge[\'latitude\']).reshape(-1,1))
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))
merge[\'longitude\'] = scaler.fit_transform(np.array(merge[\'longitude\']).reshape(-1,1))

merge[""pos""] = merge.longitude.round(3).astype(str) + \'_\' + merge.latitude.round(3).astype(str)
pos_vc = merge[\'pos\'].value_counts()
d_pos_vc = pos_vc.to_dict()
merge[\'density\'] = merge[""pos""].apply(lambda x: d_pos_vc.get(x, pos_vc.min()))

features_to_use = np.concatenate([features_to_use, [\'latitude\', \'longitude\', \'density\']])
features_to_use = np.unique(features_to_use)
features_to_use",Yes,2,18.0 "merge[\'num_description_len\'] = merge[\'description\'].str.len()
merge[\'num_description_words\'] = merge[\'description\'].apply(lambda x:len(x.split(\' \')))
merge[\'price_per_bedrooms\'] = merge[\'price\']/merge[\'bedrooms\']
merge[\'price_per_bathrooms\'] = merge[\'price\']/merge[\'bathrooms\']
merge[\'price_per_rooms\'] = merge[\'price\']/merge[\'rooms\']
merge[\'beds_percent\'] = merge[\'bedrooms\']/merge[\'rooms\']
merge[\'num_capital_letters\'] = merge[\'description\'].apply(lambda x: sum(1 for c in x if c.isupper()))
merge[\'num_address_len\'] = merge[\'display_address\'].str.len()
merge[\'num_address_words\'] = merge[\'display_address\'].apply(lambda x:len(x.split(\' \')))
merge[\'address_east\'] = merge[\'street_address\'].apply(lambda x: x.find(\'East\') > -1).astype(int)
merge[\'address_west\'] = merge[\'street_address\'].apply(lambda x: x.find(\'West\') > -1).astype(int)
merge[\'num_photos\'] = merge[\'photos\'].str.len()
merge[\'num_features\'] = merge[\'features\'].str.len()
merge[\'num_photos_low\'] = merge[\'num_photos\'].apply(lambda x:1 if x > 22 else 0) # all is low
merge[\'price_low_medium\'] = merge[\'price\'].apply(lambda x:1 if 7500< x < 10000 else 0) # all is low or medium
merge[\'price_low\'] = merge[\'price\'].apply(lambda x:1 if x >= 10000 else 0) # all is low
def cap_share(x):
return sum(1 for c in x if c.isupper())/float(len(x) + 1)
merge[\'num_cap_share\'] = merge[\'description\'].apply(cap_share)
merge[\'num_description_lines\'] = merge[\'description\'].apply(lambda x: x.count(\'\\n\\n\'))
merge[\'num_redacted\'] = 0
merge.loc[merge[\'description\'].str.contains(\'website_redacted\'), \'num_redacted\'] = 1
merge[\'num_email\'] = 0
merge.loc[merge[\'description\'].str.contains(\'@\'), \'num_email\'] = 1

reg = re.compile("".*?(\\(?\\d{3}\\D{0,3}\\d{3}\\D{0,3}\\d{4}).*?"", re.S)
def try_and_find_nr(description):
if reg.match(description) is None:
return 0
return 1
merge[\'num_phone_nr\'] = merge[\'description\'].apply(try_and_find_nr)
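# try_and_find_nr flags descriptions containing a US-style phone number: an optional opening
# parenthesis, three digits, up to three non-digit separators, three digits, more separators,
# then four digits; re.S lets the pattern match across line breaks in the description.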



features_to_use = np.concatenate([features_to_use, [\'num_description_len\', \'num_description_words\',
\'price_per_bedrooms\', \'price_per_bathrooms\', \'price_per_rooms\', \'num_photos\', \'num_features\',
\'num_photos_low\', \'price_low_medium\', \'price_low\',
\'beds_percent\', \'num_capital_letters\', \'num_address_len\',
\'num_address_words\', \'address_east\', \'address_west\',
\'num_cap_share\', \'num_description_lines\',
\'num_redacted\', \'num_email\', \'num_phone_nr\']])
features_to_use = np.unique(features_to_use)
features_to_use",Yes,4,8.0 "interest_level_dict = {'low' : 0, 'medium' : 1, 'high' : 2 } merge['interest'] = merge['interest_level'].map(interest_level_dict)",No,5,20.0 "created_time = pd.to_datetime(merge['created'],format='%Y-%m-%d %H:%M:%S') merge['month'] = created_time.dt.month merge['day'] = created_time.dt.day merge['hour'] = created_time.dt.hour merge['weekday'] = created_time.dt.weekday merge['week'] = created_time.dt.week merge['quarter'] = created_time.dt.quarter merge['weekend'] = ((merge['weekday'] == 5) | (merge['weekday'] == 6)) merge['days_since'] = created_time.max() - created_time merge['days_since'] = (merge['days_since'] / np.timedelta64(1, 'D')).astype(int) features_to_encode = np.concatenate([features_to_encode, ['month', 'day', 'hour', 'weekday', 'week', 'quarter', 'hour', 'weekend']]) features_to_encode = np.unique(features_to_encode) features_to_encode",Yes,4,8.0 "display_address_min_df = 10 street_address_min_df = 10 features_min_df = 10 description_max_features = 20",No,5,77.0 "cv = CountVectorizer(min_df=display_address_min_df) X_display_address = cv.fit_transform(merge['display_address']) cv = CountVectorizer(min_df=street_address_min_df) X_street_address = cv.fit_transform(merge['street_address']) merge['features_'] = merge['features'].apply(lambda x:' '.join(['_'.join(k.split(' ')) for k in x])) cv = CountVectorizer(stop_words='english', max_features=200) X_features = cv.fit_transform(merge['features_']) tv = TfidfVectorizer(max_features=description_max_features, ngram_range=(1, 5), stop_words='english') X_description = tv.fit_transform(merge['description']) X_vectorized = hstack((X_display_address, X_street_address, X_features, X_description)).tocsr()",Yes,4,8.0 "ohe = sklearn.preprocessing.OneHotEncoder() X_encode = ohe.fit_transform(merge[features_to_encode])",No,5,20.0 "def union_features(features_to_use, X_encode, X_vectorized, target, nrow_train): X_origin = merge[features_to_use] X_origin.fillna(0 ,inplace=True) X = hstack((X_origin, X_encode, X_vectorized)).tocsr() y = merge[target] X_train_all = X[:nrow_train] X_test = X[nrow_train:] y_train_all = y[:nrow_train] # y_test = y[nrow_train:] X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2 , random_state=10) return X_train, X_test, X_val, y_train, y_val X_train, X_test, X_val, y_train, y_val = union_features(features_to_use, X_encode, X_vectorized, target, nrow_train)",Yes,4,21.0 merge.info(),No,5,40.0 "Y_pred = model.predict(xgb.DMatrix(X_test)) ids = np.array(test['listing_id'])",No,5,48.0 "preds = pd.DataFrame({""listing_id"": ids, ""high"":Y_pred[:, 0],
""medium"":Y_pred[:, 1], ""low"":Y_pred[:, 2]})
preds.to_csv(\'my_submission.csv\' ,index=False)",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.impute import SimpleImputer from pandas import read_csv %matplotlib inline",Yes,3,22.0 "predict = pd.DataFrame(index=output['datetime']) predict['count'] = output['count'].values predict.head() plot_timeseries_train_and_predict(train_data, predict, 2011, 2)",No,2,41.0 "fig = plt.figure(figsize=(16,9)) plt.plot(train_data.index, train_data['count'], 'b', label = 'train') plt.plot(output['datetime'],output['count'], 'r', label = 'test') plt.title('Train and Test') plt.legend() plt.grid()",No,5,75.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn import preprocessing #hy from sklearn.preprocessing import StandardScaler #hy from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.neural_network import MLPClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.ensemble import GradientBoostingClassifier from sklearn.impute import SimpleImputer from pandas import read_csv %matplotlib inline",Yes,4,22.0 "def plot_decision_tree(a,b):
""""""
http://scikit-learn.org/stable/modules/tree.html
""""""
dot_data = tree.export_graphviz(a, out_file=None, feature_names=b,class_names=[\'Healthy\',\'Diabetes\'],filled=False, rounded=True,special_characters=False)
graph = graphviz.Source(dot_data)
return graph

def plot_confusion_matrix(cm, classes,normalize=False,title=\'Confusion matrix\',cmap=plt.cm.Blues):
""""""
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
""""""
plt.imshow(cm, interpolation=\'nearest\', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = \'.2f\' if normalize else \'d\'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),horizontalalignment=""center"",color=""white"" if cm[i, j] > thresh else ""black"")
plt.tight_layout()
plt.ylabel(\'True label\')
plt.xlabel(\'Predicted label\')

models = []
models.append((""LR"",LogisticRegression()))
models.append((""NB"",GaussianNB()))
models.append((""KNN"",KNeighborsClassifier()))
models.append((""DT"",DecisionTreeClassifier()))
models.append((""SVM"",SVC()))",Yes,1,80.0 "dataset = read_csv('../input/train.csv') dataset=dataset[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']] dataset.head(10)",Yes,2,41.0 "dataset2 = dataset.iloc[:, :-1]
print(""# of Rows, # of Columns: "",dataset2.shape)
print(""\
Column Name # of Null Values\
"")
print((dataset2[:] == 0).sum())",No,4,39.0 "trainingData = read_csv(\'../input/train.csv\')
trainingData=trainingData[[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\',\'Outcome\']]
testingData = read_csv(\'../input/test.csv\')
testingData=testingData[[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']]
trainingFeatures = trainingData.iloc[:, :-1]
trainingLabels = trainingData.iloc[:, -1]
imputer = SimpleImputer(missing_values=0,strategy=\'median\')
trainingFeatures = imputer.fit_transform(trainingFeatures)
trainingFeatures = pd.DataFrame(trainingFeatures)
trainingFeatures.columns=[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']
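# Zero readings for Glucose, BloodPressure, SkinThickness, Insulin and BMI stand in for
# missing values in this dataset, hence missing_values=0 with a median fill. Because the
# imputer runs on every column, legitimate zeros in Pregnancies are replaced as well, which
# is a known quirk of this simple setup.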
#further feature engineering
#trainingData[\'Glucose\'] = ((trainingData[\'Glucose\'] >= 160)|(trainingData[\'Glucose\'] <= 100)).astype(int) #hy
#trainingData[\'Pregnancies\'] = (trainingData[\'Pregnancies\'] >= 5).astype(int)
#trainingData[\'Insulin\'] = (trainingData[\'Insulin\'] >= 200).astype(int)
#trainingData[\'DiabetesPedigreeFunction\'] = (trainingData[\'DiabetesPedigreeFunction\'] >= 0.5).astype(int)#hy
#print(trainingData[:])

testingData = imputer.transform(testingData)
testingData = pd.DataFrame(testingData)
#testingData.columns=[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']
#testingData[\'Glucose\'] = ((testingData[\'Glucose\'] >= 160)|(testingData[\'Glucose\'] <= 100)).astype(int) #hy
#testingData[\'Pregnancies\'] = (testingData[\'Pregnancies\'] >= 5).astype(int)
#testingData[\'Insulin\'] = (testingData[\'Insulin\'] >= 200).astype(int)
#testingData[\'DiabetesPedigreeFunction\'] = (testingData[\'DiabetesPedigreeFunction\'] >= 0.5).astype(int)

print(""# of Rows, # of Columns: "",trainingFeatures.shape)
print(""\
Column Name # of Null Values\
"")
print((trainingFeatures[:] == 0).sum())",Yes,1,12.0 "g = sns.heatmap(trainingFeatures.corr(),cmap=""Blues"",annot=False)",No,5,80.0 "#trainingFeatures2 = trainingFeatures.drop(['Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin','Id'], axis=1) trainingFeatures2 = trainingFeatures.drop(['Id'], axis=1)",No,5,10.0 "g = sns.heatmap(trainingFeatures2.corr(),cmap=""Blues"",annot=False) print(trainingFeatures2.corr())",Yes,3,80.0 "
#model = DecisionTreeClassifier(max_depth=8,min_samples_leaf=2)
#0.70-no norm . 0.76--w/ normalization
""""""
model = LogisticRegression(penalty=\'l2\', dual=False, tol=0.0001,
C=1.0, fit_intercept=True, intercept_scaling=1,
class_weight=None, random_state=10, solver=\'liblinear\',
max_iter=100, multi_class=\'ovr\', verbose=0, warm_start=False, n_jobs=1)
""""""
#model =GaussianNB() #74.17
#model= RandomForestClassifier(max_depth=6, random_state=0)
#"""""" %77.00
model= RandomForestClassifier(bootstrap=True, class_weight=None, criterion=\'gini\',
max_features=\'auto\', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
random_state=10, verbose=0, warm_start=False)
#""""""
#model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=6), n_estimators=150, random_state=10) #74

""""""70.07
model = MLPClassifier(activation=\'relu\', alpha=1e-05, batch_size=\'auto\',
beta_1=0.9, beta_2=0.999, early_stopping=False,
epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate=\'constant\',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
solver=\'lbfgs\', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
""""""
#model = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=\'auto\', solver=\'eigen\', store_covariance=False, tol=0.0001)
#model = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, store_covariance=False, store_covariances=None, tol=0.0001)

#SVC: rbf(64) linear(76) poly-3
""""""
model = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=\'ovr\', degree=2, gamma=\'auto\', kernel=\'linear\',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
""""""
#gradientBoostingClassifier . 0.77
#params = {\'max_depth\':9, \'subsample\':0.5, \'learning_rate\':0.01, \'min_samples_leaf\':1, \'random_state\':0}
#model = GradientBoostingClassifier(n_estimators=290,**params)

#model = GradientBoostingClassifier()

X_train, X_test, y_train, y_test = train_test_split(trainingFeatures2, trainingLabels, test_size=0.1, random_state=10)


#original
model.fit(X_train, y_train)

#scaler = StandardScaler()
#X_train_scaler = scaler.fit_transform(X_train)
#model.fit(X_train_scaler, y_train)

columns = trainingFeatures2.columns
feature_names = trainingFeatures2.columns.values

#coefficients = model.feature_importances_.reshape(trainingFeatures2.columns.shape[0], 1)
#absCoefficients = abs(coefficients)
#fullList = pd.concat((pd.DataFrame(columns, columns = [\'Feature\']), pd.DataFrame(absCoefficients, columns = [\'absCoefficient\'])), axis = 1).sort_values(by=\'absCoefficient\', ascending = False)
#print(\'\\nFeature Importance:\\n\\n\',fullList,\'\\n\')
#plot_decision_tree(model,feature_names)",Yes,1,21.0 "kfold = KFold(n_splits=10, random_state=10)
results = cross_val_score(model, trainingFeatures2, trainingLabels, cv=kfold)
#print(""DecisionTreeClassifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))
print(""Logistic Regression Classifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))
#original
prediction = model.predict(X_test)

#X_test_scaler = scaler.fit_transform(X_test)
#prediction = model.predict(X_test_scaler)

cnf_matrix = confusion_matrix(y_test, prediction)
dict_characters = {0: \'Healthy\', 1: \'Diabetes\'}
plot_confusion_matrix(cnf_matrix, classes=dict_characters,title=\'Confusion matrix\')",Yes,2,48.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] #test2 = test.drop(['Id','Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin'], axis=1) test2 = test.drop(['Id' ], axis=1) #test2_scaler = scaler.fit_transform( test2) my_predictions = model.predict(test2) #my_predictions = model.predict(test2_scaler) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False) my_submission.head(10)",Yes,1,12.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import GridSearchCV from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier from sklearn import svm %matplotlib inline",Yes,3,22.0 "X = train.drop(['Outcome'], axis = 1) y = train.Outcome",Yes,3,10.0 "# parameters = {'criterion': ('gini', 'entropy'), 'n_estimators': [10, 50, 100, 105, 150]} # gb = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) # clf = GridSearchCV(gb, parameters) clf.fit(X,y)",Yes,2,4.0 predicted = clf.predict(test),No,5,48.0 print(predicted),No,1,53.0 "output = pd.DataFrame(predicted,columns = ['Outcome']) test = pd.read_csv('../input/test.csv') output['Id'] = test['Id'] output[['Id','Outcome']].to_csv('submission.csv', index = False) output.head()",Yes,1,12.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.impute import SimpleImputer from pandas import read_csv from sklearn.ensemble import GradientBoostingClassifier from xgboost import XGBClassifier %matplotlib inline",Yes,3,22.0 "trainingData = read_csv('../input/train.csv') trainingData=trainingData[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']] testingData = read_csv('../input/test.csv') testingData=testingData[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] trainingFeatures = trainingData.iloc[:, :-1] trainingLabels = trainingData.iloc[:, -1] imputer = SimpleImputer(missing_values=0,strategy='median') trainingFeatures = imputer.fit_transform(trainingFeatures) trainingFeatures = pd.DataFrame(trainingFeatures) trainingFeatures.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] testingData = imputer.transform(testingData) testingData = pd.DataFrame(testingData) testingData.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] trainingFeatures2 = trainingFeatures.drop(['Id'], axis=1)",Yes,1,12.0 "#model = DecisionTreeClassifier(max_depth=2,min_samples_leaf=2)
model = GradientBoostingClassifier(n_estimators=110, max_depth=3, min_samples_split=310, min_samples_leaf=5, max_features=7, subsample=0.85, learning_rate=0.1)
#model = XGBClassifier( learning_rate =0.1, n_estimators=10, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= \'binary:logistic\', nthread=4, scale_pos_weight=1, seed=27)
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, trainingFeatures2, trainingLabels, cv=kfold)
print(""DecisionTreeClassifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))

model.fit(trainingFeatures2, trainingLabels)",Yes,1,4.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] test2 = test.drop(['Id'], axis=1) my_predictions = model.predict(test2) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False)",Yes,1,12.0 "from __future__ import print_function import math from IPython import display from matplotlib import cm from matplotlib import gridspec from matplotlib import pyplot as plt import numpy as np import pandas as pd from sklearn import metrics import tensorflow as tf from tensorflow.python.data import Dataset import seaborn as sns tf.logging.set_verbosity(tf.logging.ERROR) pd.options.display.max_rows = 10 pd.options.display.float_format = '{:.1f}'.format ",Yes,3,22.0 "data = pd.read_csv(""../input/train.csv"")",No,5,45.0 "data = data.reindex( np.random.permutation(data.index)) data.head()",Yes,3,41.0 "sns.heatmap(data.corr(), annot=True)",No,5,80.0 data.isnull().sum(),No,5,39.0 data['Outcome'].hist(bins = 20),No,5,33.0 "sns.pairplot(data, hue='Outcome')",No,5,33.0 "data[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].hist(figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8);",No,5,33.0 "data=data[data['Pregnancies']<=11] data=data[data['BMI']>=12] data=data[data['BloodPressure']>40] data=data[data['Glucose']>40] data=data[data['SkinThickness']<60]",No,5,14.0 data.describe(),No,4,40.0 "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None): # Convert pandas data into a dict of np arrays. features = {key:np.array(value) for key,value in dict(features).items()} # Construct a dataset, and configure batching/repeating. ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit ds = ds.batch(batch_size).repeat(num_epochs) # Shuffle the data, if specified. if shuffle: ds = ds.shuffle(10000) # Return the next batch of data. features, labels = ds.make_one_shot_iterator().get_next() return features, labels",Yes,1,12.0 "def train_linear_classifier_model(
learning_rate,
steps,
hidden_units,
batch_size,
training_examples,
training_targets,
validation_examples,
validation_targets):
periods = 45
steps_per_period = steps / periods
# Create a linear classifier object.
my_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
DNN_classifier = tf.estimator.DNNClassifier(
feature_columns=construct_feature_columns(training_examples),
hidden_units=hidden_units,
optimizer=my_optimizer
)
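# Despite the helper name, the estimator built here is a DNNClassifier; wrapping Adam in
# clip_gradients_by_norm caps the global gradient norm at 5.0 before each update is applied.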
# Create input functions.
training_input_fn = lambda: my_input_fn(training_examples,
training_targets[""Outcome""],
batch_size=batch_size)
predict_training_input_fn = lambda: my_input_fn(training_examples,
training_targets[""Outcome""],
num_epochs=1,
shuffle=False)
predict_validation_input_fn = lambda: my_input_fn(validation_examples,
validation_targets[""Outcome""],
num_epochs=1,
shuffle=False)

# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
print(""Training model..."")
print(""LogLoss (on training data):"")
training_log_losses = []
validation_log_losses = []
for period in range (0, periods):
# Train the model, starting from the prior state.
DNN_classifier.train(
input_fn=training_input_fn,
steps=steps_per_period
)
# Take a break and compute predictions.
training_probabilities = DNN_classifier.predict(input_fn=predict_training_input_fn)
training_probabilities = np.array([item[\'probabilities\'] for item in training_probabilities])

validation_probabilities = DNN_classifier.predict(input_fn=predict_validation_input_fn)
validation_probabilities = np.array([item[\'probabilities\'] for item in validation_probabilities])

training_log_loss = metrics.log_loss(training_targets, training_probabilities)
validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
# Occasionally print the current loss.
print("" period %02d : %0.2f"" % (period, training_log_loss))
# Add the loss metrics from this period to our list.
training_log_losses.append(training_log_loss)
validation_log_losses.append(validation_log_loss)
print(""Model training finished."")
# Output a graph of loss metrics over periods.
plt.ylabel(""LogLoss"")
plt.xlabel(""Periods"")
plt.title(""LogLoss vs. Periods"")
plt.tight_layout()
plt.plot(training_log_losses, label=""training"")
plt.plot(validation_log_losses, label=""validation"")
plt.legend()
return DNN_classifier",Yes,1,4.0 "DNN_classifier = train_linear_classifier_model( learning_rate=0.001, steps=800, batch_size=80, hidden_units=[10, 10,10], training_examples=training_examples, training_targets=training_targets, validation_examples=validation_examples, validation_targets=validation_targets)",No,4,4.0 "testData = pd.read_csv(""../input/test.csv"")",No,5,45.0 testData.head(),No,5,41.0 "testData.isna().sum() testData['Outcome'] = 0",Yes,4,39.0 "test_examples = preprocess_features(testData) test_examples.head()",Yes,3,41.0 test_validations = preprocess_targets(testData),No,1,53.0 "predict_test_input_fn = lambda: my_input_fn(test_examples,
test_validations[""Outcome""],
num_epochs=1,
shuffle=False)

test_predictions = DNN_classifier.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item[\'probabilities\'][1] for item in test_predictions])
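# Each prediction dict exposes a probabilities array; index 1 is the predicted probability of
# Outcome == 1, which the histogram below visualizes.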

_ = plt.hist(test_predictions)",Yes,2,48.0 "testData[['Id','Outcome']].to_csv('Submit.csv', index = False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

#id,y,Usage
#1,100,Public
#2,100,Private

df = pd.DataFrame({\'id\': [1,2], \'y\': [100,100]})
df.head()
df.to_csv(""submission.csv"", header = True, index = False)
# Any results you write to the current directory are saved as output.",Yes,4,22.0 " import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from pandas import DataFrame, Series import random from tqdm import tqdm import os import math import numpy as np import h5py import matplotlib.pyplot as plt import tensorflow as tf from tensorflow.python.framework import ops import cv2 from keras.utils import to_categorical import glob from matplotlib import pyplot as plt import cv2 from keras.models import Sequential, Model from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, Flatten, MaxPool2D from keras.optimizers import adam from keras import regularizers from keras.utils import plot_model from keras.applications.vgg19 import VGG19 from keras.layers import Input, Dense, Dropout from keras import backend as K # Any results you write to the current directory are saved as output.",No,5,22.0 "train_path = '../input/train/*.jpg' x_train_adres = glob.glob(train_path) m_train = len(x_train_adres) y_train = np.zeros((m_train,1)) for i,ca in enumerate(x_train_adres): if 'cat' in ca: y_train[i] = 1 print(y_train.shape) # print(y_train) # print(x_train_adres[m_train-1]) ",Yes,2,58.0 "trainingFeatures2 = trainingFeatures.drop(['Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin','Id'], axis=1)",No,5,10.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] test2 = test.drop(['Id','Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin'], axis=1) my_predictions = model.predict(test2) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False) my_submission.head()",Yes,2,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import numpy as np import pandas as pd train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",Yes,4,45.0 "X_train = train.iloc[:, 2:-1].values Y_train = train.iloc[:, 21].values X_test = test.iloc[:, 2:].values",No,5,21.0 "X_all = np.concatenate((X_train, X_test), axis=0) ",No,5,11.0 "from sklearn.preprocessing import Imputer imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0) imputer = imputer.fit(X_all) X_all = imputer.transform(X_all)",No,5,17.0 "from sklearn.preprocessing import QuantileTransformer sc = QuantileTransformer() X = sc.fit_transform(X) X_test = sc.transform(X_test)",No,4,8.0 "from sklearn.kernel_approximation import AdditiveChi2Sampler sc = AdditiveChi2Sampler() X = sc.fit_transform(X) X_test = sc.transform(X_test)",No,3,8.0 "from sklearn.ensemble import AdaBoostClassifier clf = AdaBoostClassifier(n_estimators = 60 , learning_rate = 0.3) clf.fit(X, Y) Y_pred = clf.predict(X) from sklearn.metrics import accuracy_score accuracy_score(Y, Y_pred)",Yes,3,4.0 Y_test_pred = clf.predict(X_test),No,5,48.0 "cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': Y_test_pred }
submission = pd.DataFrame(cols)
print(submission)

submission.to_csv(""submission.csv"", index=False)",Yes,4,25.0 "import numpy as np import pandas as pd from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import QuantileTransformer from sklearn.decomposition import PCA from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import KernelCenterer from sklearn import svm",No,5,22.0 "data = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") submition = pd.read_csv(""../input/sample_submission.csv"")",No,5,45.0 "input_label = np.array(data.get('TARGET_5Yrs')) data = data.drop(['PlayerID','Name','TARGET_5Yrs'] ,axis=1) data = data.fillna(data.mean()) test = test.drop(['PlayerID','Name'] ,axis=1) test = test.fillna(test.mean())",Yes,3,10.0 "quantile = QuantileTransformer(n_quantiles=3000) data[np.array(data.columns[:])] = quantile.fit_transform(data[np.array(data.columns[:])]) test[np.array(test.columns[:])] = quantile.transform(test[np.array(test.columns[:])])",No,5,8.0 "gradientBoosting_clf = GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=1) gradientBoosting_clf.fit(data,input_label) gradientBoosting_given_labels = gradientBoosting_clf.predict(final)",Yes,3,4.0 "submition.iloc[:,1] = gradientBoosting_given_labels",No,5,14.0 "submition.to_csv(""submission_6.csv"", index=False)",No,5,25.0 "#imports import csv import numpy as np from sklearn import datasets from sklearn import svm from sklearn.preprocessing import Imputer import pandas as pd from sklearn.model_selection import cross_val_score from sklearn import preprocessing from sklearn import metrics from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA ",No,5,22.0 "#reading datasets train = pd.read_csv(""../input/trainecsv/train.csv"") test = pd.read_csv(""../input/iust-nba-rookies/test.csv"") # print(train.info()) #print(test.info())",No,5,45.0 "#DATA preporcessing #standardizing std_scale = preprocessing.StandardScaler().fit(train) train_std = std_scale.transform(train) test_std = std_scale.transform(test) # #PCA # pca_std = PCA(n_components=10).fit(train_std) # train_stdwPCA = pca_std.transform(train_std) # test_stdwPCA = pca_std.transform(test_std) #normalize train_normalized = preprocessing.normalize(train_std, norm='l2') test_normalized = preprocessing.normalize(test_std, norm='l2') ",No,5,18.0 "# #KNN (3) from sklearn.neighbors import KNeighborsClassifier # #find best k for knn # accs=[] # ks=[] # for k in range (1,50): # Tknn=KNeighborsClassifier(n_neighbors=k) # acc=cross_val_score(Tknn, train_normalized, train_labels, cv=10, scoring='accuracy') # accs.append(acc.mean()) # ks.append(k) # print('Best K value in KNN with Max Accuracy is :',(accs.index(max(accs))+1)) # print('Best Accuracy : ', max(accs)) # best_k = accs.index(max(accs))+1 #use best K for knn knn=KNeighborsClassifier(n_neighbors=2).fit(train_normalized,train_labels) #acc2=cross_val_score(knn, train_normalized, train_labels, cv=10, scoring='accuracy') trainpred=knn.predict(train_normalized) #testpred=knn.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # print(acc2) # 
print(np.mean(acc2)) #results.append(knn.predict(test_normalized)) # print(knn.get_params().keys()) ",Yes,4,7.0 "#MLP (4) from sklearn.neural_network import MLPClassifier MLP = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(25, 10), random_state=1).fit(train_normalized,train_labels) trainpred=MLP.predict(train_normalized) #testpred=MLP.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) #acc3=cross_val_score(MLP, train_normalized, train_labels, cv=10, scoring='accuracy') # print(acc3) # print(np.mean(acc3)) #results.append(MLP.predict(test_normalized)) # print(MLP.get_params().keys()) ",Yes,3,7.0 "#GradientBoosting (7) from sklearn.ensemble import GradientBoostingClassifier GBC_clf = GradientBoostingClassifier(n_estimators=2000, learning_rate=0.008, max_depth=1, random_state=1).fit(train_normalized, train_labels) trainpred=GBC_clf.predict(train_normalized) #testpred=GBC_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # GBC_acc=cross_val_score(GBC_clf,train_normalized,train_labels,cv=20,scoring='accuracy') # print(GBC_acc) # print(np.mean(GBC_acc))",Yes,3,7.0 "#randomForest (8) from sklearn.ensemble import RandomForestClassifier random_forest_clf = RandomForestClassifier(n_estimators=100).fit(train_normalized,train_labels) #acc_random_forest = cross_val_score(random_forest_clf, train, train_labels, cv=10, scoring='accuracy') trainpred=random_forest_clf.predict(train_normalized) #testpred=random_forest_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # print(acc_random_forest) # print(np.mean(acc_random_forest)) #results.append(random_forest_clf.predict(test_normalized))",Yes,3,7.0 "#DecisionTree (9) from sklearn.tree import DecisionTreeClassifier DT_clf = DecisionTreeClassifier(max_depth=15, min_samples_split=3,random_state=6) DT_clf.fit(train_normalized,train_labels) trainpred=DT_clf.predict(train_normalized) #testpred=DT_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # DT_acc = cross_val_score(DT_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(DT_acc) # print(np.mean(DT_acc)) ",Yes,3,7.0 "# ExtraTreesClassifier (10) from sklearn.ensemble import ExtraTreesClassifier ET_clf = ExtraTreesClassifier(n_estimators=30, max_depth=12,min_samples_split=3, random_state=0) ET_clf.fit(train_normalized,train_labels) trainpred=ET_clf.predict(train_normalized) #testpred=ET_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # ET_acc = cross_val_score(ET_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(ET_acc) # print(np.mean(ET_acc)) ",Yes,3,7.0 "#AdaBoost Classifier (12) from sklearn.ensemble import AdaBoostClassifier AdB_clf = AdaBoostClassifier(n_estimators=450) AdB_clf.fit(train_normalized,train_labels) trainpred=AdB_clf.predict(train_normalized) #testpred=AdB_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # AdB_acc = cross_val_score(AdB_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(AdB_acc) # print(np.mean(AdB_acc))",Yes,3,7.0 "#voting
from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import GridSearchCV

ens_clf=VotingClassifier(estimators=[(\'kn\', knn), (\'ml\', MLP),(\'gbc\', GBC_clf), (\'rf\', random_forest_clf),
(\'dt\', DT_clf), (\'et\', ET_clf), (\'adb\', AdB_clf)],
voting=\'soft\', weights=[1, 2, 3, 5, 5, 5, 4])
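# Soft voting averages the class probabilities of the seven estimators using the weights
# above and picks the class with the highest weighted average, so every estimator needs a
# working predict_proba, which all seven provide.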


# grid = GridSearchCV(estimator=ens_clf, cv=5)

ens_clf.fit(train_normalized,train_labels)

#ens_acc = cross_val_score(ens_clf, train_normalized, train_labels, cv=10, scoring=\'accuracy\')


trainpredEns=ens_clf.predict(train_normalized)
#print(metrics.accuracy_score(train_labels, trainpredEns))

# print (ens_acc)
# print(np.mean(ens_acc))

print(""ENSDone"")",Yes,3,7.0 "#predicting results

result=ens_clf.predict(test_normalized)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': result }
submission = pd.DataFrame(cols)


submission.to_csv(""submission.csv"", index=False)

print(submission.info())
print (submission)
print(""done"")",Yes,3,48.0 "from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import Imputer from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.preprocessing import StandardScaler from statsmodels.compat import pandas as pd import pandas as pd from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn import svm, preprocessing from sklearn.decomposition import PCA",No,5,22.0 "test = pd.read_csv(""../input/test.csv"")
test_x = test.iloc[:, 2:].values
imp = Imputer(missing_values = \'NaN\', strategy = \'mean\', axis = 0)
imp=imp.fit(test_x)
test_x = imp.transform(test_x)
dataset = pd.read_csv(""../input/train.csv"")
print(dataset.info())
X = dataset.iloc[:, 2:-1].values
imputer = Imputer(missing_values = \'NaN\', strategy = \'mean\', axis = 0)
input_dim = X.shape[1]
imputer = imputer.fit(X)
X = imputer.transform(X)
y = dataset.iloc[:, 21].values

#Standard
sc = StandardScaler()
X_train = sc.fit_transform(X)
x_test = sc.transform(test_x)
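# The scaler is fit on the training matrix only and its means/variances are reused to
# transform the test matrix, so both end up on the same scale without using test statistics.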
#//////////////////////

#quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#Xprime = quantile_transformer.fit_transform(X)
#txprime = quantile_transformer.transform(test_x)
#///////////////
Xtrain = preprocessing.normalize(X_train, norm=\'l2\')
Xtest = preprocessing.normalize(x_test, norm=\'l2\')
#/////////////
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)
# Xprime = poly.fit_transform(X)
# tsprime = poly.transform(test_x)
#/////////////////
# pca = PCA(n_components=2)
# Xprime=pca.fit_transform(Xprime)
# tsprime = pca.fit_transform(test_x)
# print(Xprime.shape)
# print(tsprime)
#////////////////////

print(X)
print(y)
print(test_x)",Yes,3,17.0 "#Classifiers
#Choosing best one
#/////////////////////////
# model = Sequential()
# model.add(Dense(20, input_dim=input_dim))
# model.add(Activation(\'relu\'))
# model.add(Dropout(0.15))
# model.add(Dense(10))
# model.add(Activation(\'relu\'))
# model.add(Dropout(0.25))
# model.add(Dense(1))
# model.add(Activation(\'softmax\'))
#///////////////
#SVM-RBF
#from sklearn.svm import SVC
#classifier = SVC(kernel = \'rbf\')
#classifier.fit(X, y)
#
#y_predsvm = classifier.predict(test_x)
#SVC//////////////////

#clf = SVC()
#clf.fit(X,y)
#y_predsvc= clf.predict(test_x)
# preds = model.predict_classes(test_x, verbose=0)
#////////////////////
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X, y)
#print(""gnb:"", gnb.score(X,y))
#y_predgnb = gnb.predict(test_x)
#
# model.compile(optimizer=\'rmsprop\', loss=\'mae\')
#
#
# model.fit(X, y, epochs=10)
#//////////////////////////
#KNN
knn = KNeighborsClassifier(n_neighbors = 5, metric = \'minkowski\', p = 2)
knn.fit(X, y)
#y_predknn = knn.predict(test_x)
#print(""knn:"",knn.score(X,y))
#/////////////////
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators = 100, criterion = \'entropy\')
rf.fit(X,y)
#y_predrf = rf.predict(test_x)
#print(""rf:"",rf.score(X,y))
#/////////////////////
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gb.fit(X, y)
#y_predgb = gb.predict(test_x)
#print(""gb:"", gb.score(X,y))
#////////////////
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X, y)
#y_predlr = lr.predict(test_x)
#print(""lr:"",lr.score(X,y))
#y_pred = gb.predict(test_x)
#y_pred = classifier.predict(test_x)
#//////////QDA
qda= QuadraticDiscriminantAnalysis()
qda.fit(X,y)
#y_predqda = qda.predict(test_x)
#///////
#SVM
#svm = svm.SVC(kernel=\'linear\', C = 1.0)
#svm.fit(X,y)
#y_predsvm = svm.predict(test_x)
#print(""svm"",svm.score(X,y))
# print(clf.predict(test_x))

#/////////////
#Adaboost
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm=""SAMME"",n_estimators=200)

adb.fit(X,y)
#y_predadb = adb.predict(test_x)
#print(""adb:"",adb.score(X,y))
#///////////
# Voting Classifier(LR, RF,AdaBoost,SVM,GNB,Knn,GBC)

clf1 = LogisticRegression(random_state=100)
clf2 = RandomForestClassifier(n_estimators = 100, criterion = \'entropy\')
clf3 = SVC(gamma=2, C=1)
clf4 = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
#clf3 = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
clf5 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm=""SAMME"",
n_estimators=200)
clf6 = GaussianNB()
clf7 = KNeighborsClassifier(n_neighbors = 5, metric = \'minkowski\', p = 2)
#clf8 = svm
#clf9 = QuadraticDiscriminantAnalysis()
# Majority Vote
eclf1 = VotingClassifier(estimators=[(\'lr\', clf1), (\'rf\', clf2), (\'svm\', clf3), (\'gbc\', clf4),(\'adb\',clf5),(\'gnb\',clf6),(\'knn\',clf7)], voting=\'hard\')
eclf1 = eclf1.fit(X, y)
preds= eclf1.predict(test_x)

print(preds)
",Yes,3,7.0 "cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': [eclf1.predict([test_x[i]])[0] for i in range(440)] }
submission = pd.DataFrame(cols)
print(submission)
submission.to_csv(""submission1.csv"", index=False)",Yes,4,25.0 "import pandas as pd import numpy as np import sklearn from sklearn import preprocessing from sklearn.decomposition import PCA from sklearn.ensemble import IsolationForest from sklearn import svm import statistics import matplotlib.pyplot as plt from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis ",No,5,22.0 "t_data = pd.read_csv(""../input/train.csv"") ts_data = pd.read_csv(""../input/test.csv"")",No,5,45.0 "ddata = t_data.drop([""PlayerID"",""Name""], axis=1) sdata = ts_data.drop([""PlayerID"",""Name""], axis=1)",No,5,10.0 "ddata = ddata.interpolate() ddata = ddata.replace([np.inf], np.float64.max) ddata = ddata.replace([-np.inf], np.float64.min) features = ddata.loc[:, ddata.columns.values[:len(ddata.columns.values)-1]].values labels = ddata.loc[:, ['TARGET_5Yrs']].values st_features = preprocessing.StandardScaler().fit_transform(features) sdata = sdata.interpolate() sdata = sdata.replace([np.inf], np.float64.max) sdata = sdata.replace([-np.inf], np.float64.min) sfeatures = sdata.loc[:, sdata.columns.values].values st_sfeatures = preprocessing.StandardScaler().fit_transform(sfeatures)",Yes,3,17.0 "pca = PCA(n_components=10)

pca.fit(features)
principalComponents = pca.transform(features)
test_principalComponenta = pca.transform(sfeatures)
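# PCA is fit on the training features only; the same 10 components then project the test
# features, so both sets live in one coordinate system.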
print(principalComponents.shape, ""\\n"", test_principalComponenta.shape)
submission = pd.DataFrame(cols)
submission.to_csv(""submission.csv"", index=False)
print(submission)",Yes,4,25.0 "import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.utils import shuffle
from scipy.stats import mode
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.ensemble import IsolationForest
from sklearn.ensemble import IsolationForest
from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

# =========================================================

names = [""Nearest Neighbors"", ""Linear SVM"", ""RBF SVM"",
""Decision Tree"", ""Random Forest"", ""Neural Net"", ""AdaBoost"",
""Naive Bayes"", ""Logistic Regression"", ""Bagging"", ""GradBoost"", ""ExtraTree""]#, ""Gaussian Process""]

classifiers = [
KNeighborsClassifier(n_neighbors=5),
SVC(kernel=""linear"", C = 0.6), #C=0.025
SVC(C = 0.6),
DecisionTreeClassifier(),
RandomForestClassifier(n_estimators=200, max_depth=2),
MLPClassifier(alpha=1),
AdaBoostClassifier(n_estimators=200),
GaussianNB(),
LogisticRegression(random_state=1),
BaggingClassifier(n_estimators=200),
GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=2),
ExtraTreeClassifier()]#,
#GaussianProcessClassifier(1.0 * RBF(1.0))]

# =========================================================

# Read DataSet and put in X and y
dataset = pd.read_csv(\'../input/train.csv\')
dataset_test = pd.read_csv(""../input/test.csv"")
to_drop = [\'PlayerID\', \'Name\']
dataset.drop(to_drop, inplace=True, axis=1)
dataset_test.drop(to_drop, inplace=True, axis=1)

#dataset = dataset.interpolate(method=\'values\')
dataset = dataset.fillna(dataset.mean())
dataset_test = dataset_test.fillna(dataset_test.mean())

dataset = shuffle(dataset)

#--------------------------------------

X = dataset.iloc[:, 0:-1].values
X[:, 8] = 1
y = dataset.iloc[:, 19].values

X_value = dataset_test.iloc[:, 0:].values
X_value[:, 8] = 1
# =========================================================

# #preprocess dataset, split into training and test part
# sc = StandardScaler()
# X = sc.fit_transform(X)
# X_value = sc.fit_transform(X_value)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
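# 25% of the rows are held out; each classifier below is scored on this holdout with a
# binary F1 before the per-classifier predictions are combined by majority vote.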

# =========================================================

# iterate over classifiers
y_pred_matrix = []
for name, clf in zip(names, classifiers):
clf.fit(X_train, y_train)
#score = clf.score(X_test, y_test)
#print(name, score)
y_pred = clf.predict(X_test)
y_pred[y_pred==-1] = 0
y_pred_matrix.append(y_pred)
f1_scr = f1_score(y_test, y_pred, average=\'binary\')
print(name, f1_scr)

y_pred_matrix = np.array(y_pred_matrix)
final_pred = mode(y_pred_matrix, axis=0)[0]
final_pred = final_pred.flatten()
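# scipy.stats.mode along axis 0 picks, for every test row, the label predicted by the most
# classifiers, i.e. a plain unweighted majority vote across the twelve models.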
final_score = f1_score(y_test, final_pred, average=\'binary\')
print (final_score)

# =========================================================

#Voting by all classifiers
ZipList = list(zip(names, classifiers))
clf_Vot = VotingClassifier(estimators = ZipList, voting=\'hard\')
clf_Vot.fit(X_train, y_train)
#score = clf_Vot.score(X_test, y_test)
#print(score)
y_pred = clf_Vot.predict(X_test)
f1_scr = f1_score(y_test, y_pred, average=\'binary\')
print(f1_scr)

# =========================================================

clf_Vot.fit(X, y)
y_pre = clf_Vot.predict(X_value)
#print(y_pre)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': y_pre }
result = pd.DataFrame(cols)
result.to_csv(""Sub_Vot.csv"", index=False)
#result

# =========================================================

clf_GB = GradientBoostingClassifier(n_estimators=350, learning_rate=.1, max_depth=1)
clf_GB.fit(X, y)
y_pre = clf_GB.predict(X_value)
#print(y_pre)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': y_pre }
result = pd.DataFrame(cols)
result.to_csv(""Sub_GB.csv"", index=False)
#result

print(""Complete Runing!"")",No,2,22.0 "import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import svm
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# PlayerID,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs

train = pd.read_csv(""../input/train.csv"")
test = pd.read_csv(""../input/test.csv"")
X = train.values[:,2:20]
y = train.values[:,21]
y = y.astype('int')
print (X)
print (y)

test_X = test.values[:,2:20]
# test_y = test.values[:,21]
print (test_X)
# print (test_y)
",No,3,45.0 "# testing various classifiers to choose the best accuracy.

# linear SVM
# clf = svm.SVC()
# clf.fit(X, y)
#
# SVM with sigmoid kernel
# clf = svm.SVC(kernel=\'sigmoid\')
# clf.fit(X, y)
#
# SVM with rbf kernel
# clf = svm.SVC(kernel=\'rbf\')
# clf.fit(X, y)
#
# SVM with poly kernel
# clf = svm.SVC(kernel=\'poly\')
# clf.fit(X, y)
#
# adaboost
# clf = AdaBoostClassifier(n_estimators = 350)
# clf.fit(X, y)
#
# random forest
# clf = RandomForestClassifier(n_estimators = 250)
# clf.fit(X, y)
#
# decision tree
# clf = DecisionTreeClassifier()
# clf.fit(X, y)
#
# extra tree
# clf = ExtraTreesClassifier()
# clf.fit(X, y)
#
# gaussian naive bayes
# clf = GaussianNB()
# clf.fit(X, y)
#
# logistic regression
# clf = linear_model.LogisticRegression()
# clf.fit(X, y)
#
# stochastic gradient descent
# clf = SGDClassifier(loss=""squared_loss"", penalty=""l2"")
# clf = SGDClassifier(loss=""hinge"", penalty=""l2"")
# clf.fit(X, y)
#
# multi layer perceptron
# scaler = StandardScaler()
# scaler.fit(X)
# X = scaler.transform(X)
# test_X = scaler.transform(test_X)
# print (scaler)
# print (X)
# print (test_X)
# clf = MLPClassifier(solver=\'lbfgs\', alpha=1e-4, hidden_layer_sizes=(10,5), warm_start=\'True\')
# clf.fit(X, y)
#
# Gradient boosting
params = {'n_estimators': 2000, 'learning_rate': 0.008}
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X, y)
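# the submission below scores the test rows one at a time; a single vectorised
# clf.predict(test_X) would give the same values, and the 440 / 901 constants assume
# PlayerIDs 901-1340 in the test set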

cols = { 'PlayerID': [i+901 for i in range(440)] , 'TARGET_5Yrs': [clf.predict([test_X[i]])[0] for i in range(440)] }
submission = pd.DataFrame(cols)
print(submission)
submission.to_csv(""submission.csv"", index=False)",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import calendar
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sn
from scipy import stats
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train=pd.read_csv('../input/train.csv') train.info()",Yes,4,45.0 "train[\'date\']=train.datetime.apply(lambda x: x.split()[0])
train['hour']=train.datetime.apply(lambda x:x.split()[1].split(':')[0])
train['weekday'] = train.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()])
train['month']=train.date.apply(lambda dateString: calendar.month_name[datetime.strptime(dateString,'%Y-%m-%d').month])
train['season']=train.season.map({1:'Spring',2:'Summer',3:'Fall',4:'Winter'})
train['weather']=train.weather.map({1: "" Clear + Few clouds + Partly cloudy + Partly cloudy"",\
                                    2 : "" Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist "", \
                                    3 : "" Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds"", \
4 :"" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog "" })",No,5,8.0 "category_vars=['hour','weekday','month','season','weather','holiday','workingday'] for var in category_vars: train[var]=train[var].astype('category') train.info()",Yes,4,16.0 "train=train.drop('datetime',axis=1)",No,5,10.0 "#model.(valdn_x, valdn_y, batch_size=32, verbose=1) predictions = model.predict(test_x, batch_size=32, verbose=1)",No,5,48.0 "with open(\'submission.csv\',\'w\') as f:
    f.write('id,label\n')
    for index in range(len(test_imgs)):
        img_id = basename(test_imgs[index]).split(""."")[0]
        prob = (predictions[index,0])
        #print(""index: {}, img_id: {}, prob:{}"".format(index,img_id, prob))
        f.write(""{},{}\n"".format(img_id, prob))",Yes,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",Yes,3,22.0 "from keras.layers import Conv2D,Dense,MaxPooling2D,BatchNormalization,Activation,Flatten from keras import Sequential from keras.initializers import glorot_normal from keras import optimizers from keras.models import Model from keras.applications.imagenet_utils import decode_predictions",No,5,22.0 "#################################TRYING OUT THE RESNET 50 ARCHITECTURE################################################ import random l=[1,2,3] ",Yes,3,22.0 "from keras.applications import VGG19 v=VGG19(weights=""imagenet"",include_top=False,input_shape=(120,120,3))",Yes,3,22.0 "#########defining the new model by defining my own last layer new_model=v.output new_model=Flatten()(new_model) new_model=Dense(10)(new_model) new_model=Activation(""relu"")(new_model) new_model=Dense(1,activation=""sigmoid"")(new_model) final_model=Model(input=v.input,output=new_model)",No,5,4.0 "###freezin all layers except from last 3 layers total_layers=len(final_model.layers) print(total_layers) for x in range(0,total_layers-4): final_model.layers[x].trainable=False # final_model.layers",No,5,84.0 "###train test split from sklearn.model_selection import train_test_split train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,13.0 "####compiling the model o=optimizers.adam() final_model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o)",Yes,4,59.0 "final_model.fit(train_x,train_y,batch_size=32,epochs=15,validation_split=0.2)",No,5,7.0 "preds=final_model.predict(test_x) new_preds=[] for x in preds: if x >0.5: new_preds.append(1) else: new_preds.append(0) new_preds=np.array(new_preds) new_preds=new_preds.reshape(len(new_preds),1) ",No,5,48.0 "sum(new_preds==test_y)/len(test_y) train_x=[] test_x=[] data=[] labels=[]",No,5,77.0 "########importing test file data=[] input_file_names=[] #####get the file names of the images to read them one by one for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/test/""): input_file_names=filenames for x in input_file_names: img_file_name=x##getting name of the image file path=str(""../input/dogs-vs-cats-redux-kernels-edition/test/""+img_file_name)####making proper path of the image file i=image.load_img(path)####reading the image from the path i=i.resize((120,120))#####resizing the image iarray=image.img_to_array(i)####converting it to arrau data.append(iarray)#####appending the image to the list",No,5,84.0 "data=np.array(data)/255. preds=final_model.predict(data)",No,5,27.0 "df=pd.DataFrame({'id':new_input_test_file_names, 'label':preds})",No,5,12.0 "df.to_csv(""submission2.csv"",index=False)",No,5,25.0 "#import libraries import pandas as pd import numpy as np import os, random ,cv2 import keras from sklearn.model_selection import train_test_split from keras.models import Sequential from keras.layers import Conv2D, Dense, Flatten, Dropout, Activation, MaxPool2D from keras.optimizers import Adam, RMSprop from keras.losses import binary_crossentropy from keras.preprocessing.image import ImageDataGenerator import matplotlib.pyplot as plt ",No,5,22.0 "#specify train and test datasets paths train_path = '../input/train/' test_path = '../input/test/' #define image dimensions rows = 150 cols = 150 channels = 3",No,5,77.0 "#create a list of train image paths ""including image name""
train_images = [train_path+i for i in os.listdir(train_path)]
train_dogs = [train_path+i for i in os.listdir(train_path) if \'dog\' in i]
train_cats = [train_path+i for i in os.listdir(train_path) if \'cat\' in i]",No,5,77.0 "#create a list of test image paths ""including image name"" test_images = [test_path+i for i in os.listdir(test_path)]",No,5,77.0 "train_images = train_dogs[:3000] + train_cats[:3000] #randomly shuffle train images random.shuffle(train_images)",Yes,3,21.0 "def prep_data(image_path_list): x=[] y=[] for i in image_path_list: x.append(cv2.resize(plt.imread(i), #read then resize image (rows,cols), interpolation=cv2.INTER_CUBIC)) #appened new image to x for j in image_path_list: #create a label and append it to y if 'dog' in j: y.append(1) elif 'cat' in j: y.append(0) return x,y ",No,5,21.0 "X, y = prep_data(train_images)",No,5,21.0 "#split X,y into a train and validation data sets X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=(1/3), random_state=1)",No,5,13.0 "X_test, y_test = prep_data(test_images)",No,5,21.0 "#create a keras CNN model from sctarch model = Sequential() model.add(Conv2D(32,(3,3), input_shape=(rows, cols, 3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(64,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(128,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(256,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Flatten()) model.add(Dense(256)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(256)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(1)) model.add(Activation('sigmoid'))",No,5,4.0 "model.compile(optimizer='RMSprop', metrics=['accuracy'], loss='binary_crossentropy')",No,5,84.0 "#create a data generator object with some image augmentation specs datagen = ImageDataGenerator( rescale=1./ 255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True ) ",No,5,84.0 "#create an iterator for both train and valid sets train_gen = datagen.flow(x=np.array(X_train), y=y_train, batch_size=50) valid_gen = datagen.flow(x=np.array(X_val), y=y_val, batch_size=50)",No,5,77.0 "#train/validate model model.fit_generator(train_gen, steps_per_epoch=40, epochs=50, verbose=1, validation_data=valid_gen, validation_steps=20)",No,5,7.0 "#create a data generator object for testing datagen = ImageDataGenerator(rescale = 1./255)",No,5,84.0 "test_gen = datagen.flow(np.array(X_test), batch_size = 100)",No,5,84.0 "#predict predictions = model.predict_generator(test_gen, steps=125, verbose=1)",No,5,48.0 "#submit id_num = range(1, len(predictions_dogs) + 1) submission = pd.DataFrame({""id"": id_num, ""label"":predictions_dogs}) submission.to_csv(""submission.csv"", index = False)",No,5,25.0 "import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import IPython.display as ipd
import keras
from keras.models import *
from keras.callbacks import *
from keras.layers import *
from keras.preprocessing.image import random_brightness,random_rotation,random_shear,random_shift,random_zoom
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import skimage
import skimage.transform
import skimage.color
import imageio
import matplotlib.pyplot as plt
import seaborn as sns
import os

TRAIN_DIR = ""../input/train/""
TEST_DIR = ""../input/test/""

train = pd.DataFrame()
train['file'] = os.listdir(TRAIN_DIR)
train['class'] = train['file'].apply(lambda x: x.split('.')[0])
train['class_id'] = train['class'].apply(lambda x: 0 if x=='cat' else 1)
test = pd.DataFrame()
test['file'] = os.listdir(TEST_DIR)
test['id'] = test['file'].apply(lambda x: x.split('.')[0])
test['label'] = 0.5

train.head()",No,3,22.0 "sns.countplot(x='class', data=train);",No,5,33.0 "def make_model(size=(256,256)): def make_cnn(kernel_nums, x): for n in kernel_nums: x = Conv2D(n, kernel_size=3, strides=1, activation='relu', padding='same')(x) x = MaxPooling2D(pool_size=2, strides=2, padding='same')(x) x = BatchNormalization()(x) x = SpatialDropout2D(0.3)(x) return Flatten()(x) inp = Input((size[0],size[1],1)) kernel_nums = [64, 64,128,128,256,256,512,512] scaled = inp cnn_outs = [] for i in range(6): scaled = AveragePooling2D(pool_size=2**i, strides=2**i)(inp) cnn_outs.append(make_cnn(kernel_nums[:len(kernel_nums)-i], scaled)) x = concatenate(cnn_outs) x = Dense(512, activation='relu')(x) x = Dropout(0.3)(x) out = Dense(1, activation='sigmoid')(x) return Model(inp, out)",No,5,4.0 "SIZE = (256,256) model = make_model(size=SIZE) model.summary() #keras.utils.plot_model(model, to_file='model.png', show_shapes=True) #ipd.Image(filename='model.png')",No,5,4.0 "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) batch_size=150 X_train, X_valid, y_train, y_valid = train_test_split(train['file'].values, train['class_id'].values, test_size=0.1) model.fit_generator(image_generator(X_train, TRAIN_DIR, labels=y_train, size=SIZE, batch_size=batch_size, random_preproc=False, rotation_range=10, shear_range=5, shift_range=(0.1,0.1), zoom_range=(0.8,1.2)), epochs=25, steps_per_epoch=int(math.ceil(len(y_train)/batch_size)), validation_data=image_generator(X_valid, TRAIN_DIR, labels=y_valid, size=SIZE, batch_size=batch_size, random_preproc=False), validation_steps=int(math.ceil(len(y_valid)/batch_size)), callbacks=[EarlyStopping(monitor='val_loss',patience=3,verbose=0)], verbose=1 )",Yes,4,7.0 "predicted_probs = model.predict_generator(image_generator(X_valid, TRAIN_DIR, size=SIZE, batch_size=batch_size, random_preproc=False), steps=int(math.ceil(len(y_valid)/batch_size)) ) predicted = np.round(predicted_probs) print(classification_report(y_valid, predicted)) print(log_loss(y_valid, predicted_probs)) sns.heatmap(confusion_matrix(y_valid, predicted), annot=True);",Yes,3,48.0 "predicted_probs = model.predict_generator(image_generator(test['file'], TEST_DIR, size=SIZE, batch_size=batch_size, random_preproc=False), steps=int(math.ceil(len(test['file'])/batch_size)) ) test['label'] = predicted_probs test[['id','label']].to_csv('submission.csv', index=False)",Yes,3,48.0 "PATH = ""../input/"" TMP_PATH = ""/tmp/tmp"" MODEL_PATH = ""/tmp/model/"" sz=224",No,5,77.0 "fnames = np.array([f'train/{f}' for f in sorted(os.listdir(f'{PATH}train'))]) labels = np.array([(0 if 'cat' in fname else 1) for fname in fnames])",No,5,77.0 "from fastai.imports import * from fastai.transforms import * from fastai.conv_learner import * from fastai.model import * from fastai.dataset import * from fastai.sgdr import * from fastai.plots import *",No,5,22.0 "arch=resnet50 ",No,5,4.0 "data = ImageClassifierData.from_names_and_array( path=PATH, fnames=fnames, y=labels, classes=['dogs', 'cats'], test_name='test', tfms= tfms_from_model(resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1) #data augmentation ) learn = ConvLearner.pretrained(arch, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)",No,5,30.0 "learn.fit(0.01,4)",No,5,7.0 "learn.precompute=False learn.fit(1e-2, 3, cycle_len=2)",No,5,7.0 "lr=np.array([1e-4,1e-3,1e-2])",No,5,5.0 "learn.fit(lr, 3, cycle_len=1, cycle_mult=2)",No,5,7.0 "log_predictions,y = learn.TTA(is_test=True) prob_predictions = np.mean(np.exp(log_predictions),0) probs = 
prob_predictions[:,1]",No,5,8.0 log_predictions.shape,No,5,58.0 ids= fnames = np.array([f'{f}' for f in os.listdir(f'{PATH}test')]),No,5,77.0 "ids= [i.replace("".jpg"","""") for i in ids] ids[0]",No,5,77.0 "ans= pd.DataFrame({""id"":ids,""label"":probs})
ans= ans.sort_values('id')
ans.head()",Yes,3,9.0 ans.describe(),No,5,40.0 "cm = confusion_matrix(y, valid_preds) plot_confusion_matrix(cm, data.classes)",No,5,56.0 "ans.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn import pipeline,ensemble,preprocessing, feature_extraction from sklearn.feature_extraction.text import TfidfTransformer from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, accuracy_score from collections import Counter from sklearn.svm import LinearSVC from sklearn.model_selection import cross_validate #from sklearn import tree #from sklearn.naive_bayes import MultinomialNB #from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier #from sklearn.linear_model import SGDClassifier #from sklearn.linear_model import LogisticRegression",No,5,22.0 train = pd.read_json('../input/train.json'),No,5,44.0 %matplotlib inline,No,4,23.0 "import matplotlib.pyplot as plt plt.style.use('ggplot')",Yes,3,22.0 train['cuisine'].value_counts().plot(kind='bar'),No,5,33.0 "top10= pd.DataFrame([[items[0] for items in counters[cuisine].most_common(10)] for cuisine in counters], index=[cuisine for cuisine in counters], columns=['top{}'.format(i) for i in range(1,11)]) top10",No,4,12.0 "indices=train.ingredients.str.contains('garlic cloves') train[indices]['cuisine'].value_counts().plot(kind='bar', title= 'Dientes de ajo hallados por cocina')",No,5,33.0 "unique= np.unique(top10.values.ravel()) unique fig, axes= plt.subplots(8,8, figsize=(20,20)) for ingredient, ax_index in zip(unique, range(64)): indices=train.ingredients.str.contains(ingredient) relative_freq= (train[indices]['cuisine'].value_counts()/train['cuisine'].value_counts()) relative_freq.plot(kind='bar', ax=axes.ravel()[ax_index], fontsize=8, title=ingredient)",No,5,33.0 train.isnull().sum(),No,5,39.0 "fig,axes=plt.subplots(nrows=2,ncols=2) sn.boxplot(data=train,y='count',orient='v',ax=axes[0][0]) sn.boxplot(data=train,y='count',x='season',orient='v',ax=axes[0][1]) sn.boxplot(data=train,y='count',x='hour',orient='v',ax=axes[1][0]) sn.boxplot(data=train,y='count',x='workingday',orient='v',ax=axes[1][1])",No,5,33.0 trainwo=train[np.abs(train['count']-train['count'].mean())<=3*train['count'].std()],No,5,14.0 "print('Shape of the DataFrame with outliers: ', train.shape) print('Shape of the DataFrame without outliers: ', trainwo.shape)",No,5,58.0 "corr=train[[""temp"",""atemp"",""casual"",""registered"",""humidity"",""windspeed"",""count""]].corr() mask=np.array(corr) mask[np.tril_indices_from(mask)]=False fig,ax=plt.subplots() sn.heatmap(corr,mask=mask,vmax=.8,square=True,annot=True)",Yes,4,80.0 "clf=pipeline.Pipeline([ ('tfidf_vectorizer', feature_extraction.text.TfidfVectorizer(lowercase=True)), ('clf', LinearSVC(random_state=0)) ])",Yes,3,4.0 "# step 1: testing X_train,X_test,y_train,y_test=train_test_split(train.ingredients,train.cuisine, test_size=0.2)",No,5,13.0 "clf.fit(X_train, y_train)",No,5,7.0 y_pred = clf.predict(X_test),No,5,48.0 "confusion_matrix(y_test, y_pred)",No,5,49.0 "accuracy_score(y_test, y_pred)",No,5,49.0 "# step 2: real training test=pd.read_json('../input/test.json')",No,5,44.0 test.ingredients=test.ingredients.apply(' '.join),No,5,78.0 "clf.fit(train.ingredients,train.cuisine)",No,5,7.0 pred=clf.predict(test.ingredients),No,5,48.0 "df=pd.DataFrame({'id':test.id,'cuisine':pred})",No,5,12.0 "df.to_csv('LinearSVC.csv', columns=['id','cuisine'],index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("".""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "from matplotlib import pyplot as plt import seaborn as sns sns.set() %matplotlib inline %config InlineBackend.figure_format = 'retina'",No,5,23.0 "df_store = pd.read_csv('../input/store.csv') df = pd.read_csv('../input/train.csv', low_memory=False) df = df.merge(df_store, on='Store')",Yes,4,45.0 "df_test = pd.read_csv('../input/test.csv', low_memory=False) df_test.head()",Yes,4,45.0 df.head(5),No,5,41.0 "df['Date'] = pd.to_datetime(df['Date']) df['Month'] = df.Date.apply(lambda dt: dt.month) df['Year'] = df.Date.apply(lambda dt: dt.year) df['WeekOfYear'] = df.Date.apply(lambda dt: dt.weekofyear) df['Day'] = df.Date.apply(lambda dt: dt.day) df['isMonthEnd'] = df.Date.apply(lambda dt: dt.is_month_end) df['isMonthStart'] = df.Date.apply(lambda dt: dt.is_month_start) df['isQuarterEnd'] = df.Date.apply(lambda dt: dt.is_quarter_end ) df['isQuarterStart'] = df.Date.apply(lambda dt: dt.is_quarter_start) df['isYearEnd'] = df.Date.apply(lambda dt: dt.is_year_end) df['isYearStart'] = df.Date.apply(lambda dt: dt.is_year_start)",No,5,8.0 "features = [] for feat in df.columns.drop('Sales'): if df[feat].dtype == np.float64 or df[feat].dtype == np.int64: features.append(feat)",No,5,77.0 "fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20, 20)); df_sample = df.sample(frac=0.05) for idx, feature in enumerate(features): df_sample.plot(feature, ""Sales"", subplots=True, kind=""scatter"", ax=axes[idx // 4, idx % 4]);",No,5,33.0 # Customers Sales ( Open/Promo). -
# Promo2

df[df.columns.drop('Sales')].corrwith(df.Sales),No,5,40.0 "# ""b""

df.groupby(\'StoreType\')[\'Sales\'].mean()",No,5,60.0 sns.distplot(df.Sales[df.Sales > 0]),No,5,33.0 df.info(),No,5,40.0 "# Promo2, ~SinceWeek ~SinceYear
df[(pd.isnull(df.Promo2SinceWeek) | pd.isnull(df.Promo2SinceYear)) & df.Promo2 != 0]",No,5,14.0 "df['CompetitionOpenSinceMonth'].fillna(0, inplace=True) df['CompetitionOpenSinceYear'].fillna(0, inplace=True)",No,5,17.0 "df['Promo2SinceWeek'].fillna(0, inplace=True) df['Promo2SinceYear'].fillna(0, inplace=True)",No,5,17.0 "df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True) df['CompetitionDistance'] = np.log(df.CompetitionDistance) + 1",Yes,4,17.0 "df.sample(frac=.001).plot(\'CompetitionDistance\', ""Sales"", subplots=True, kind=""scatter"")",No,5,33.0 "# ,
df.groupby('Store')['CompetitionDistance'].unique().apply(lambda l: 1 if len(l) > 1 else 0).sum()",No,5,60.0 "# ,
df['StateHoliday'] = df['StateHoliday'].replace(0, '0')
df['Holiday'] = df.StateHoliday.apply(lambda x: 0 if x == '0' else 1)

df.drop('StateHoliday', axis=1, inplace=True)",Yes,4,8.0 "df = df.sort_values(by='Date') df.drop('Date', axis=1, inplace=True)",Yes,3,9.0 "df = df[(df['Open'] != 0) & (df['Sales'] != 0)] df.drop('Open', axis=1, inplace=True)",Yes,3,14.0 #

df.PromoInterval.value_counts(),No,5,72.0 "df['isMonthEnd'] = df['isMonthEnd'].astype(int) df['isMonthStart'] = df['isMonthStart'].astype(int) df['isQuarterEnd'] = df['isQuarterEnd'].astype(int) df['isQuarterStart'] = df['isQuarterStart'].astype(int) df['isYearEnd'] = df['isYearEnd'].astype(int) df['isYearStart'] = df['isYearStart'].astype(int)",No,5,16.0 "# competition open time (in months)
df['CompetitionOpen'] = 12 * (df.Year - df.CompetitionOpenSinceYear) + \
(df.Month - df.CompetitionOpenSinceMonth)

# Promo open time
df['PromoOpen'] = 12 * (df.Year - df.Promo2SinceYear) + \
(df.WeekOfYear - df.Promo2SinceWeek) / 4.0
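# pd.get_dummies below one-hot encodes the listed categorical columns; dummy_na=True also
# adds an explicit indicator column for missing values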

df = pd.get_dummies(df, columns=['DayOfWeek', 'StoreType', 'Assortment','PromoInterval'], dummy_na=True)",Yes,4,8.0 "import xgboost as xgb
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
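# train() below fits one XGBoost regressor on the rows selected by `index` (presumably a
# per-store mask built outside this cell): Sales is modelled as log(Sales) + 1, an optional
# hyperopt search tunes the parameters, and RMSPE on a small hold-out tail is reported
# (rmspe / rmspe_xg are assumed to be defined in an earlier cell)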

def train(index, train, hp_selection=False):
    train_store = train[index]
    X = train_store[train_store.columns.drop(['Sales', 'Store', 'Customers'])]
    y = train_store['Sales']

    train_size = int(X.shape[0]*.99)
    print(f'Regressor for {index} store\nTraining on {X.shape[0]} samples')
    X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
    X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

    xtrain = xgb.DMatrix(X_train, np.log(y_train.values) + 1)
    xtest = xgb.DMatrix(X_test, np.log(y_test.values) + 1)

    if hp_selection:
        def score(params):
            num_round = 200
            model = xgb.train(params, xtrain, num_round, feval=rmspe_xg)
            predictions = model.predict(xtest)
            score = rmspe(y=y_test, yhat=predictions)
            return {'loss': score, 'status': STATUS_OK}

        def optimize(trials):
            space = {
                'n_estimators' : hp.quniform('n_estimators', 1, 1000, 1),
                'eta' : hp.quniform('eta', 0.2, 0.825, 0.025),
                'max_depth' : hp.choice('max_depth', np.arange(1, 14, dtype=int)),
                'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
                'subsample' : hp.quniform('subsample', 0.7, 1, 0.05),
                'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                'eval_metric': 'rmse',
                'objective': 'reg:linear',
                'nthread': 4,
                'silent' : 1
            }

            best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)
            return best

        trials = Trials()
        best_opts = optimize(trials)
        best_opts['silent'] = 1
    else:
        best_opts = {'colsample_bytree': 0.7,
                     'eta': 0.625,
                     'gamma': 0.8,
                     'max_depth': 6,
                     'eval_metric': 'rmse',
                     'min_child_weight': 6.0,
                     'n_estimators': 8.0, # 585
                     'silent': 1,
                     'nthread': 4,
                     'subsample': 0.95}

    watchlist = [(xtrain, 'train'), (xtest, 'eval')]
    num_round = 10000
    regressor = xgb.train(best_opts, xtrain, num_round, watchlist, feval=rmspe_xg,
                          verbose_eval=10, early_stopping_rounds=50)
    print(""Validating"")
    train_probs = regressor.predict(xtest)
    indices = train_probs < 0
    train_probs[indices] = 0
    error = rmspe(np.exp(train_probs) - 1, y_test.values)
    print('error', error)
    regressor = xgb.train(best_opts, xtest, 10, feval=rmspe_xg, xgb_model=regressor)
    return regressor",Yes,2,2.0 "df_test = pd.read_csv('../input/test.csv', low_memory=False)
closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values

df_test = df_test.merge(df_store, on='Store')
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test['Month'] = df_test.Date.apply(lambda dt: dt.month)
df_test['Year'] = df_test.Date.apply(lambda dt: dt.year)
df_test['WeekOfYear'] = df_test.Date.apply(lambda dt: dt.weekofyear)
df_test['Day'] = df_test.Date.apply(lambda dt: dt.day)

df_test['isMonthEnd'] = df_test.Date.apply(lambda dt: dt.is_month_end).astype(int)
df_test['isMonthStart'] = df_test.Date.apply(lambda dt: dt.is_month_start).astype(int)
df_test['isQuarterEnd'] = df_test.Date.apply(lambda dt: dt.is_quarter_end).astype(int)
df_test['isQuarterStart'] = df_test.Date.apply(lambda dt: dt.is_quarter_start).astype(int)
df_test['isYearEnd'] = df_test.Date.apply(lambda dt: dt.is_year_end).astype(int)
df_test['isYearStart'] = df_test.Date.apply(lambda dt: dt.is_year_start).astype(int)

df_test['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
df_test['CompetitionOpenSinceYear'].fillna(0, inplace=True)

df_test['Promo2SinceWeek'].fillna(0, inplace=True)
df_test['Promo2SinceYear'].fillna(0, inplace=True)

df_test['CompetitionDistance'].fillna(df_test['CompetitionDistance'].median(), inplace=True)

df_test['StateHoliday'] = df_test['StateHoliday'].replace(0, '0')
df_test['Holiday'] = df_test.StateHoliday.apply(lambda x: 0 if x == '0' else 1)

df_test.drop('StateHoliday', axis=1, inplace=True)
df_test.drop('Date', axis=1, inplace=True)

# competition open time (in months)
df_test['CompetitionOpen'] = 12 * (df_test.Year - df_test.CompetitionOpenSinceYear) + \
(df_test.Month - df_test.CompetitionOpenSinceMonth)

# Promo open time
df_test['PromoOpen'] = 12 * (df_test.Year - df_test.Promo2SinceYear) + \
(df_test.WeekOfYear - df_test.Promo2SinceWeek) / 4.0

df_test.drop(['Open'], axis=1, inplace=True)

df_test = pd.get_dummies(df_test, columns=['DayOfWeek', 'StoreType', 'Assortment','PromoInterval'], dummy_na=True)
",Yes,4,8.0 "fig,(ax1,ax2,ax3)=plt.subplots(ncols=3) sn.regplot(x='temp',y='count',data=train,ax=ax1) sn.regplot(x='windspeed',y='count',data=train,ax=ax2) sn.regplot(x='humidity',y='count',data=train,ax=ax3)",No,5,33.0 "train=pd.read_csv(\'../input/train.csv\')
test=pd.read_csv(""../input/test.csv"")",No,5,45.0 "data=train.append(test) data.reset_index(inplace=True) data.drop('index',inplace=True,axis=1) ",Yes,4,10.0 "data[\'date\']=data[\'datetime\'].apply(lambda x:x.split()[0])
data['hour']=data['datetime'].apply(lambda x:x.split()[1].split(':')[0])
data['year']=data['date'].apply(lambda x:x.split('-')[0])
data['month']=data['date'].apply(lambda x:datetime.strptime(x,""%Y-%m-%d"").month)
data[\'weekday\']=data[\'date\'].apply(lambda x:datetime.strptime(x,""%Y-%m-%d"").weekday())",No,5,8.0 "from sklearn.ensemble import RandomForestRegressor wind0=data[data['windspeed']==0] windNot0=data[data['windspeed']!=0] rf_wind=RandomForestRegressor() wind_cols=['season','weather','year','month','temp','atemp','humidity'] rf_wind.fit(windNot0[wind_cols],windNot0['windspeed']) pred=rf_wind.predict(X=wind0[wind_cols]) wind0['windspeed']=pred data=windNot0.append(wind0) data.reset_index(inplace=True) data.drop('index',axis=1,inplace=True) ",Yes,2,7.0 "categorical_features=['season','month','year','workingday','holiday','weather','hour'] numerical_features=['humidity','windspeed','temp','atemp'] drop_features=['casual','registered','datetime','date','count'] for var in categorical_features: data[var]=data[var].astype('category')",No,5,16.0 "train=data[pd.notnull(data['count'])].sort_values(by=['datetime']) test=data[~pd.notnull(data['count'])].sort_values(by='datetime') datetimecol=test['datetime'] y_train=train['count'] y_train_registered=train['registered'] y_train_casual=train['casual'] train=train.drop(drop_features,axis=1) test=test.drop(drop_features,axis=1)",No,5,21.0 "from sklearn.linear_model import LinearRegression, Ridge, Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) lr=LinearRegression() y_train_log=np.log1p(y_train) lr.fit(train,y_train_log) pred=lr.predict(train) print(""RMSLE Value For Linear Regression: "",rmsle(y_train_log,pred,True)) ",Yes,2,22.0 df_submission,No,5,41.0 "df_submission.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor rf=RandomForestRegressor(n_estimators=100) y_train_log=np.log1p(y_train) rf.fit(train,y_train_log) pred=rf.predict(train) print(""RMSLE Value For Random Forest: "", rmsle(y_train_log,pred,True))",Yes,3,7.0 "# def score(params):
# print(""Training with params : "")
# print(params)
# num_round = int(params[\'n_estimators\'])
# model = xgb.train(params, xtrain, num_round, feval=rmspe_xg)
# predictions = model.predict(xtest)
# score = rmspe(y=y_test, yhat=predictions)
# br = \'-\'*124
# print(f'{br}\n\tScore of RMSPE: {score}\n{br}')
# return {\'loss\': score, \'status\': STATUS_OK}

# def optimize(trials):
# space = {
# \'n_estimators\' : hp.quniform(\'n_estimators\', 1, 1000, 1),
# \'eta\' : hp.quniform(\'eta\', 0.3, 0.825, 0.025),
# \'max_depth\' : hp.choice(\'max_depth\', np.arange(1, 14, dtype=int)),
# \'min_child_weight\' : hp.quniform(\'min_child_weight\', 1, 6, 1),
# \'subsample\' : hp.quniform(\'subsample\', 0.7, 1, 0.05),
# \'gamma\' : hp.quniform(\'gamma\', 0.5, 1, 0.05),
# \'colsample_bytree\' : hp.quniform(\'colsample_bytree\', 0.5, 1, 0.05),
# \'eval_metric\': \'rmse\',
# \'objective\': \'reg:linear\',
# \'nthread\': 4,
# \'silent\' : 1
# }

# best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

# print(best)
# return best


# trials = Trials()
# best_opts = optimize(trials)
",No,5,53.0 "# def score(params):
# print(""Training with params : "")
# print(params)
# num_round = 25 # int(params[\'n_estimators\'])
# # del params[\'n_estimators\']
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_test, label=y_test)
# model = xgb.train(params, dtrain, num_round)
# predictions = model.predict(dvalid)
# score = mae(y_test, predictions)
# br = \'-\'*130
# print(f'{br}\n\tScore of MAE: {score}\n{br}')
# return {\'loss\': score, \'status\': STATUS_OK}

# def optimize(trials):
# space = {
# \'n_estimators\' : hp.quniform(\'n_estimators\', 100, 1000, 1),
# \'eta\' : hp.quniform(\'eta\', 0.4, 0.825, 0.025),
# \'max_depth\' : hp.choice(\'max_depth\', np.arange(1, 14, dtype=int)),
# \'min_child_weight\' : hp.quniform(\'min_child_weight\', 1, 6, 1),
# \'subsample\' : hp.quniform(\'subsample\', 0.5, 1, 0.05),
# \'gamma\' : hp.quniform(\'gamma\', 0.5, 1, 0.05),
# \'colsample_bytree\' : hp.quniform(\'colsample_bytree\', 0.5, 1, 0.05),
# \'eval_metric\': \'mae\',
# \'objective\': \'reg:linear\',
# \'nthread\': 4,
# \'silent\' : 1
# }

# best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=50)

# print(best)

# trials = Trials()
# optimize(trials)",No,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgbm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers

import warnings
warnings.filterwarnings(\'ignore\')

import os
print(os.listdir(""../input""))

import regex as re
import gc
# Any results you write to the current directory are saved as output.",No,4,88.0 "baseline_tree_score = 0.23092278864723115 baseline_neuralnetwork_score = 0.5480561937041435",No,5,77.0 "train = pd.read_csv('../input/kaggletutorial/covertype_train.csv') test = pd.read_csv('../input/kaggletutorial/covertype_test.csv')",No,5,45.0 train_index = train.shape[0],No,5,77.0 "lgbm_param = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.06,
""num_leaves"": 16,
""max_depth"": 6,
""colsample_bytree"": 0.7,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}",No,5,59.0 "def keras_model(input_dims): model = Sequential() model.add(Dense(input_dims, input_dim=input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//2)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.2)) # output layer (y_pred) model.add(Dense(1)) model.add(Activation('sigmoid')) # compile this model model.compile(loss='binary_crossentropy', # one may use 'mean_absolute_error' as alternative optimizer='adam', metrics=['accuracy']) return model def keras_history_plot(history): plt.plot(history.history['loss'], 'y', label='train loss') plt.plot(history.history['val_loss'], 'r', label='val loss') plt.xlabel('epoch') plt.ylabel('loss') plt.legend(loc='upper right') plt.show()",Yes,4,4.0 "from sklearn.ensemble import GradientBoostingRegressor gbr=GradientBoostingRegressor(n_estimators=4000,alpha=0.01) y_train_log=np.log1p(y_train) gbr.fit(train,y_train_log) pred=gbr.predict(train) print(""RMSLE Value For Gradient Boost: "", rmsle(y_train_log,pred,True))",Yes,3,7.0 "lgbm_param = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 24,
""max_depth"": 6,
""colsample_bytree"": 0.65,
""subsample"": 0.7,
""reg_alpha"": 0.1,
""reg_lambda"": 0.2,
""nthread"":8
}",No,5,59.0 "pred_test=gbr.predict(test) fig,(ax1,ax2)=plt.subplots(ncols=2) sn.distplot(y_train,ax=ax1,bins=50) sn.distplot(np.exp(pred_test),ax=ax2,bins=50)",Yes,3,48.0 "submission=pd.DataFrame({ 'datetime':datetimecol, 'count':[max(0,x) for x in np.exp(pred_test)] }) submission.to_csv('bike_predictions_gbm.csv',index=False)",Yes,4,25.0 "import os print((os.listdir('../input/')))",No,5,88.0 "import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score",No,5,22.0 "df_train = pd.read_csv('../input/web-club-recruitment-2018/train.csv') df_test = pd.read_csv('../input/web-club-recruitment-2018/test.csv') feature_cols=['X1','X2','X3','X5','X6','X7','X8','X9','X10','X12','X13','X14','X15','X16','X17','X18','X19','X20','X21','X22','X23']",No,5,45.0 "dtrain = lgbm.Dataset(train_df, label=y_value) clf = lgbm.train(lgbm_param, train_set=dtrain, num_boost_round=5000) predict = clf.predict(test_df)",Yes,3,7.0 "submission = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission['Cover_Type'] = predict submission.to_csv('lgbm_last.csv', index=False)",No,3,25.0 "def keras_model(input_dims): model = Sequential() model.add(Dense(input_dims, input_dim=input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//2)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//5)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) # output layer (y_pred) model.add(Dense(1)) model.add(Activation('sigmoid')) # compile this model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model ",No,5,84.0 "y_value = train_df['Cover_Type'] del train_df['Cover_Type'], train_df['ID'] del test_df['Cover_Type'], test_df['ID'] model = keras_model(train_df.shape[1]) callbacks = [ EarlyStopping( patience=10, verbose=10) ]",No,4,4.0 """"""" CV .
NFOLD = 5
folds = StratifiedKFold(n_splits= NFOLD, shuffle=True, random_state=2018)

total_score = 0
best_epoch = 0
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y_value)):
train_x, train_y = train_df.iloc[train_idx], y_value.iloc[train_idx]
valid_x, valid_y = train_df.iloc[valid_idx], y_value.iloc[valid_idx]

history = model.fit(train_x.values, train_y.values, nb_epoch=30, batch_size = 64, validation_data=(valid_x.values, valid_y.values),
verbose=1, callbacks=callbacks)

keras_history_plot(history)
predict = model.predict(valid_x.values)
null_count = np.sum(pd.isnull(predict) )
if null_count > 0:
print(""Null Prediction Error: "", null_count)
predict[pd.isnull(predict)] = predict[~pd.isnull(predict)].mean()

cv_score = log_loss(valid_y, predict )
total_score += cv_score
best_epoch = max(best_epoch, np.max(history.epoch))
print(\'Fold {} LogLoss : {}\'.format(n_fold + 1, cv_score ))

print(""Best Epoch: "", best_epoch)
print(""Total LogLoss"", total_score/NFOLD)
print(""Baseline model Score Diff"", total_score/NFOLD - baseline_neuralnetwork_score)
""""""",Yes,3,7.0 "history = model.fit(train_df.values, y_value.values, nb_epoch=30, batch_size = 64, verbose=1) predict = model.predict(test_df.values)",Yes,3,7.0 "submission_nn = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_nn['Cover_Type'] = predict submission_nn.to_csv('nn_last.csv', index=False)",No,4,25.0 "source = submission.copy() source = source.merge(submission_nn,on='ID') source",No,5,32.0 "lgbm_param1 = {
\'boosting_type\': \'dart\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 31,
""max_depth"": 7,
""colsample_bytree"": 0.8,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8,
\'drop_rate\':0.1,
\'skip_drop\':0.5,
\'max_drop\':50,
\'top_rate\':0.1,
\'other_rate\':0.1
}

lgbm_param2 = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 10,
""max_depth"": 4,
""colsample_bytree"": 0.5,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}

lgbm_param3 = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 24,
""max_depth"": 6,
""colsample_bytree"": 0.5,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}

rf_params = {
\'criterion\':\'gini\', \'max_leaf_nodes\':24, \'n_estimators\':200, \'min_impurity_split\':0.0000001,
\'max_features\':0.4, \'max_depth\':6, \'min_samples_leaf\':20, \'min_samples_split\':2,
\'min_weight_fraction_leaf\':0.0, \'bootstrap\':True,
\'random_state\':1, \'verbose\':False

}

et_parmas = {
\'criterion\':\'gini\', \'max_leaf_nodes\':31, \'n_estimators\':200, \'min_impurity_split\':0.0000001,
\'max_features\':0.6, \'max_depth\':10, \'min_samples_leaf\':20, \'min_samples_split\':2,
\'min_weight_fraction_leaf\':0.0, \'bootstrap\':True,
\'random_state\':1, \'verbose\':False
}",No,5,59.0 "et_model = SklearnWrapper(clf = ExtraTreesClassifier, params=et_parmas) rf_model = SklearnWrapper(clf = RandomForestClassifier, params=rf_params)",No,5,4.0 "x_train = pd.DataFrame(x_train_second_layer) x_test = pd.DataFrame(x_test_second_layer)",No,5,12.0 "submission_stacking = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_stacking['Cover_Type'] = predict_stacking submission_stacking.to_csv('submission_stacking.csv', index=False)",No,4,25.0 "submission_et = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_et['Cover_Type'] = et_test submission_et.to_csv('submission_et.csv', index=False)",No,4,25.0 "import cv2 # working with, mainly resizing, images
import numpy as np # dealing with arrays
import os # dealing with directories
from random import shuffle # mixing up or currently ordered data that might lead our network astray in training.
from tqdm import tqdm # a nice pretty percentage bar for tasks. Thanks to viewer Daniel Bhler for this suggestion
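# configuration below: 50x50 single-channel (grayscale) inputs and a 1e-3 learning rate
# for the tflearn convnet defined later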

TRAIN_DIR = '../input/train'
TEST_DIR = '../input/test'
IMG_SIZE = 50
LR = 1e-3

MODEL_NAME = 'dogsvscats-{}-{}.model'.format(LR, '2conv-basic') # just so we remember which saved model is which, sizes must match",Yes,4,22.0 "train_X = df_train[feature_cols] train_y = df_train.loc[:, 'Y'] df_test = df_test[feature_cols]",No,5,21.0 "rf = RandomForestClassifier(n_estimators=200,max_features='auto',max_depth=23)",No,5,4.0 "train_data = create_train_data() # If you have already created the dataset: #train_data = np.load('train_data.npy')",No,5,53.0 "rf.fit(train_X, train_y)",No,5,7.0 "import tflearn from tflearn.layers.conv import conv_2d, max_pool_2d from tflearn.layers.core import input_data, dropout, fully_connected from tflearn.layers.estimator import regression convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input') convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = fully_connected(convnet, 1024, activation='relu') convnet = dropout(convnet, 0.8) convnet = fully_connected(convnet, 2, activation='softmax') convnet = regression(convnet, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets') model = tflearn.DNN(convnet, tensorboard_dir='log')",Yes,4,4.0 " pred = rf.predict_proba(df_test)",No,5,48.0 "if os.path.exists('{}.meta'.format(MODEL_NAME)): model.load(MODEL_NAME) print('model loaded!')",No,5,30.0 "result = pd.DataFrame(pred[:,1]) result.index.name = 'id' result.columns = ['predicted_val'] result.to_csv('output.csv', index=True)",Yes,4,25.0 "train = train_data[:-500] test = train_data[-500:]",No,4,77.0 "import pandas as pd import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score from sklearn.ensemble import ExtraTreesClassifier from sklearn.tree import DecisionTreeClassifier import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split ",No,5,22.0 "df_train = pd.read_csv('../input/web-club-recruitment-2018/train.csv') df_test = pd.read_csv('../input/web-club-recruitment-2018/test.csv') ",No,5,45.0 "X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,1) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,IMG_SIZE,IMG_SIZE,1) test_y = [i[1] for i in test]",No,5,21.0 "model.fit({'input': X}, {'targets': Y}, n_epoch=2, validation_set=({'input': test_x}, {'targets': test_y}), snapshot_step=50000, show_metric=True, run_id=MODEL_NAME)",No,5,7.0 "X = df_train.loc[:, 'X1':'X23'] y = df_train.loc[:, 'Y'] ",No,5,14.0 "import tensorflow as tf tf.reset_default_graph()",No,5,23.0 "rf.fit(X, y) ",No,5,7.0 "convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input') convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = fully_connected(convnet, 1024, activation='relu') convnet = dropout(convnet, 0.8) convnet = fully_connected(convnet, 2, activation='softmax') convnet = regression(convnet, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets') model = tflearn.DNN(convnet, 
tensorboard_dir='log') if os.path.exists('{}.meta'.format(MODEL_NAME)): model.load(MODEL_NAME) print('model loaded!') train = train_data[:-500] test = train_data[-500:] X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,1) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,IMG_SIZE,IMG_SIZE,1) test_y = [i[1] for i in test] model.fit({'input': X}, {'targets': Y}, n_epoch=4, validation_set=({'input': test_x}, {'targets': test_y}), snapshot_step=500, show_metric=True, run_id=MODEL_NAME)",No,2,4.0 model.save(MODEL_NAME),No,5,50.0 "test = df_test.loc[:, 'X1':'X23'] pred = rf.predict_proba(test)",Yes,4,48.0 "with open('submission_file.csv','w') as f:
    f.write('id,label\n')

with open('submission_file.csv','a') as f:
    for data in tqdm(test_data):
        img_num = data[1]
        img_data = data[0]
        orig = img_data
        data = img_data.reshape(IMG_SIZE,IMG_SIZE,1)
        model_out = model.predict([data])[0]
        f.write('{},{}\n\
'.format(img_num,model_out[1]))",No,5,25.0 "import pandas as pd import matplotlib.pyplot as plt import sklearn import os %matplotlib inline print(os.listdir(""../input/dataset-adult/""))",No,5,88.0 "#Adult Data adult = pd.read_csv(""../input/dataset-adult/train_data.csv"",sep="","", na_values=""?"")",No,5,45.0 adult.shape,No,5,58.0 "import pandas as pd import sklearn",No,5,22.0 adult.head(3),No,5,41.0 "import os print(os.listdir('../input'))",No,5,88.0 adult.info(),No,5,40.0 "adult = pd.read_csv(""../input/dataadult/train_data.csv"",
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 adult.describe(),No,5,40.0 adult.head(),No,5,41.0 adult['native.country'].value_counts(),No,5,72.0 "adult['race'].value_counts().plot(kind=""pie"")",No,5,33.0 "#distribuio de idade
adult[""age""].plot(kind=\'hist\',bins=15);",No,5,33.0 "#agrupando atributo \'income\' e \'sex\' para cada idade
df=adult.groupby([""income"",""sex""]).mean()
df[\'age\'].plot(kind=""bar"")",No,5,33.0 nadult = adult.dropna(),No,5,17.0 "#Proporo de sexo por \'income\'
df2=adult.groupby([""income"",""sex""]).size().unstack().plot(kind=\'bar\',stacked=False)",No,5,33.0 nadult,No,5,41.0 "#proporo de sexo por ocupao!
df2=adult.groupby([""occupation"",""sex""])[\'race\'].size().unstack().plot(kind=\'barh\',stacked=True)",No,5,33.0 "#drop colunas empty e index ""Id"" na_adult=adult.set_index(""Id"").dropna()",No,5,17.0 "test_adult= pd.read_csv(""../input/dataset-adult/test_data.csv"",sep="","",na_values=""?"")",No,5,45.0 Yadult = nadult.income,No,4,77.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 knn = KNeighborsClassifier(n_neighbors=5),No,5,4.0 from sklearn.model_selection import cross_val_score,No,5,22.0 "#armazena todos os dados de treino e de teste (numericos e categoricos) X_adult = na_adult.iloc[:,:-1] Y_adult = na_adult.income",No,5,21.0 "X_test = test_adult.iloc[:,:]",No,5,21.0 "#treino e teste apenas de dados numericos adult num_cols=[""age"",""education.num"",""capital.gain"",""capital.loss"",""hours.per.week""] X_num=X_adult[num_cols] Y_num=Y_adult X_test= X_test[num_cols]",No,5,21.0 "knn.fit(Xadult, Yadult)",No,5,7.0 "testadult = pd.read_csv(""../input/dataadult/test_data.csv"",
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "#importacao de bibliotecas de ML from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import cross_val_score from sklearn.metrics import accuracy_score from sklearn.preprocessing import LabelEncoder",No,5,22.0 YtestPred = knn.predict(Xtestadult),No,5,48.0 "arr1= testadult.iloc[:,0].values
arr1 = arr1.ravel()
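# build the submission frame: the first column of the test file (the Id) paired with the
# kNN predictions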
dataset = pd.DataFrame({'Id':arr1[:],'income':YtestPred[:]})
dataset.to_csv(""Adultscompetition.csv"", index = False)",Yes,3,25.0 "import pandas as pd import sklearn",No,5,22.0 "adult = pd.read_csv(""../input/adult-db/train_data.csv"",header=0, index_col=0, na_values=""?"")",No,5,45.0 print(adult.shape),No,5,58.0 "#learning e predict knn=KNeighborsClassifier(n_neighbors=9) #instacia model scores = cross_val_score(knn,X_num,Y_num,cv=10) #validacao cruzada knn.fit(X_num,Y_num) Y_testpredict=knn.predict(X_test) scores",Yes,3,4.0 "#observando tipo de dados.. test_adult.dtypes",No,5,70.0 "#Converte 'object columns para 'str', pois object pode conter dados em outro formato: convert_cols=['workclass','education','marital.status','occupation','race','relationship', 'sex','native.country'] test_adult[convert_cols] = test_adult[convert_cols].astype(str)",No,5,16.0 "#testes.. test_adult.columns",No,5,71.0 "#treino e teste de dados numericos e categoricos: Xencode_adult= na_adult.iloc[:,:-1].apply(LabelEncoder().fit_transform) Xencode_test_adult = test_adult.apply(LabelEncoder().fit_transform) X_adult = Xencode_adult X_test = Xencode_test_adult",No,5,20.0 "adult[""native.country""].value_counts()",No,5,72.0 "import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "Yfit_adult= LabelEncoder().fit(na_adult[""income""]) Y_adult = Yfit_adult.transform(na_adult[""income""])",No,5,20.0 "adult[""age""].value_counts().plot(kind=""bar"")",No,5,33.0 "#learning e predict knn =KNeighborsClassifier(n_neighbors=10) scores = cross_val_score(knn,X_adult,Y_adult,cv=10) knn.fit(X_adult,Y_adult) scores",Yes,3,4.0 "adult[""sex""].value_counts()",No,5,72.0 "adult[""education.num""].value_counts().plot(kind=""bar"")",No,5,33.0 "Ytest_predict= knn.predict(X_test) print(Ytest_predict)",No,5,48.0 "adult[""occupation""].value_counts().plot(kind=""bar"")",No,5,33.0 "pd.unique(adult[""relationship""])",No,5,57.0 X_adult.columns,No,5,71.0 "#escolha de atributos para melhor predict atributos=atributos=[""age"",""workclass"",""education.num"",""occupation"",""sex"",""marital.status"",""capital.gain"",""capital.loss""] X_adult = Xencode_adult[atributos] X_test = Xencode_test_adult[atributos]",No,4,21.0 "#Escolhendo k=27 p kNN knn =KNeighborsClassifier(n_neighbors=27) knn.fit(X_adult,Y_adult) scores",Yes,3,4.0 "from sklearn.preprocessing import MinMaxScaler minmaxscaler = MinMaxScaler() col_inds = [0,1,4,5,6,7,8,10,11,12] # 0,1 [0,1,3,4,5,6,7,8,9,10,13] 2 [0,1,3,4,5,6,7,8,9,10,11,12,13] 3 [0,1,4,5,6,7,8,10,11,12] Xadult_unscaled = adult_fill.iloc[:,col_inds].apply(LabelEncoder().fit_transform) Xadult = minmaxscaler.fit_transform(Xadult_unscaled) Yadult = adult_fill.income print(Xadult_unscaled.columns.values)",Yes,3,18.0 Ytest_predict= knn.predict(X_test),No,5,48.0 "#dados de submissao label_out = Yfit_adult.inverse_transform(Ytest_predict) df_out = pd.DataFrame({'Id': X_test.index,'income':label_out}) df_out.to_csv('submission_adult.csv',index=False)",No,5,25.0 "pd.read_csv(""submission_adult.csv"")",No,5,45.0 "%matplotlib inline import pandas as pd import sklearn import matplotlib.pyplot as plt import numpy as np",No,5,23.0 "testAdult = pd.read_csv(""../input/adult-db/test_data.csv"",header=0, index_col=0, na_values=""?"") testAdult.shape",Yes,4,45.0 "adult = pd.read_csv(""../input/mydata/train_data.csv"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
skiprows=1,
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "adult[""Country""].value_counts()",No,5,72.0 "adult[""Age""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""Sex""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""Occupation""].value_counts().plot(kind=""bar"")",No,5,33.0 "XtestAdult_unscaled = testAdult_fill.iloc[:,col_inds].apply(LabelEncoder().fit_transform) XtestAdult = minmaxscaler.transform(XtestAdult_unscaled)",Yes,3,20.0 "testAdult = pd.read_csv(""../input/mydata/test_data.csv"",
names=[
""ID"",""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country""],
skiprows=1,
index_col=0,
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "knn = KNeighborsClassifier(n_neighbors=34,p=1) knn.fit(Xadult,Yadult)",No,5,7.0 testAdult.head(),No,5,41.0 "YtestAdult = knn.predict(XtestAdult) YtestAdult",No,5,48.0 testAdult['Capital Gain'].plot(),No,5,33.0 "prediction = pd.DataFrame(testAdult.index) prediction[""income""] = YtestAdult",Yes,4,12.0 "prediction.to_csv(""adult_prediction_5.csv"", index=False)",No,5,25.0 "import pandas as pd import numpy as np import sklearn",No,5,22.0 "adult = pd.read_csv(""../input/adultdataset/train_data.csv"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
sep=r'\s*,\s*',
engine='python',
skiprows=1,
na_values=""?"")",No,5,45.0 "nTestAdult = testAdult.dropna() nTestAdult.shape",Yes,3,17.0 "Xadult = nadult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]] Xadult.head()",No,4,41.0 Yadult = nadult.Target,No,5,21.0 "testAdult = pd.read_csv(""../input/adultdataset/test_data.csv"",
names=[
""id"", ""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country""],
sep=r'\s*,\s*',
engine='python',
skiprows=1,
na_values=""?"")",No,5,45.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 knn = KNeighborsClassifier(n_neighbors=30),No,5,4.0 YtestPred = knn.predict(XtestAdult),No,5,48.0 "result = np.vstack((testAdult[""id""], YtestPred)).T x = [""id"",""income""] Resultado = pd.DataFrame(columns = x, data = result) Resultado.to_csv(""Resultados.csv"", index = False) Resultado",Yes,3,25.0 "import os os.listdir('../input/adultb')",No,5,88.0 "adult=pd.read_csv(\'../input/adultb/train_data.csv\',
sep=',', engine='python',
na_values=""?"")",No,5,45.0 nadult = adult.copy(),No,4,77.0 adult.isnull().sum(),No,4,39.0 "Xadult = adult[['age','education.num', 'capital.gain','capital.loss', 'hours.per.week']]",No,5,21.0 "testAdult = pd.read_csv('../input/adultb/test_data.csv', sep=',',engine='python', na_values='?')",No,5,45.0 testAdult.isnull().sum(),No,4,39.0 "XtestAdult = testAdult[['age','education.num', 'capital.gain','capital.loss', 'hours.per.week']]",No,5,10.0 "result = np.vstack((testAdult[""Id""], YtestPred)).T x = [""id"",""income""] submit = pd.DataFrame(columns = x, data = result) submit.to_csv(""Resultados.csv"", index = False)",Yes,3,25.0 "import pandas as pd import sklearn import numpy as np import os from sklearn import preprocessing",No,5,22.0 "adultOriginal = pd.read_csv(""../input/adult-db/train_data.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "adultOriginal.head() ",No,5,41.0 adultOriginal.shape,No,5,58.0 "numAdult_1=adultOriginal.fillna(method='pad') numAdult_2=numAdult_1.fillna(method='pad')",No,5,17.0 "adult = numAdult_2.apply(preprocessing.LabelEncoder().fit_transform) adult",No,5,20.0 "adult[""sex""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""race""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""education""].value_counts().plot(kind=""bar"")",No,5,33.0 "test_adult = pd.read_csv(\'../input/adult-db/test_data.csv\',
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "numAdultTest_1=test_adult.fillna(method='pad') numAdultTest_2=numAdultTest_1.fillna(method='pad') adultTest = numAdultTest_2.apply(preprocessing.LabelEncoder().fit_transform) adultTest",Yes,4,17.0 "from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import cross_val_score",No,5,22.0 "Xadult = adult.drop(['workclass', 'marital.status', 'sex', 'occupation', 'relationship', 'income', 'capital.gain', 'capital.loss', 'native.country'], axis=1) Xadult",No,5,10.0 "XtestAdult = adultTest.drop(['workclass', 'marital.status', 'sex', 'occupation', 'relationship', 'capital.gain', 'capital.loss', 'native.country'], axis=1)",No,5,10.0 knn = KNeighborsClassifier(n_neighbors=21),No,5,4.0 data = pd.DataFrame(adultTest.Id),No,5,12.0 "data.to_csv(""BaseAdult_KNN.csv"", index=False)",No,5,25.0 "import pandas as pd import sklearn import matplotlib.pyplot as plt import numpy as np",No,5,22.0 "train = pd.read_csv(""../input/dataset/train_data.csv"",
na_values = \'?\')",No,5,45.0 train = train.dropna(),No,5,17.0 "Atrain = train[[""age"",""education.num"",""capital.gain"", ""capital.loss"", ""hours.per.week""]] Btrain = train.income",No,5,21.0 knn = KNeighborsClassifier(n_neighbors=15),No,5,4.0 "knn.fit(Atrain,Btrain)",No,5,7.0 "test = pd.read_csv(""../input/dataset/test_data.csv"",
na_values = \'?\')",No,5,45.0 "Atest = test[[""age"",""education.num"",""capital.gain"", ""capital.loss"", ""hours.per.week""]]",No,5,21.0 Bpred=knn.predict(Atest),No,5,48.0 prediction = pd.DataFrame(index = test.index),No,5,12.0 "prediction.to_csv(""submition.csv"",index=False)",No,5,25.0 "import os, cv2, re, random import numpy as np import pandas as pd from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing.image import img_to_array, load_img from keras import layers, models, optimizers from keras import backend as K from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix import itertools import matplotlib.pyplot as plt import matplotlib.image as mpimg import seaborn as sns %matplotlib inline",No,5,23.0 "transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root='.', train=True, download=False, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='.', train=False, download=False, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False, num_workers=2) classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')",No,3,42.0 "def imshow(img): img = img / 2 + 0.5 # unnormalize npimg = img.numpy() plt.imshow(np.transpose(npimg, (1, 2, 0))) # get some random training images dataiter = iter(trainloader) images, labels = dataiter.next() # show images imshow(torchvision.utils.make_grid(images)) # print labels print(' '.join('%5s' % classes[labels[j]] for j in range(32)))",No,5,84.0 "class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() if torch.cuda.is_available(): net.cuda()",No,5,4.0 "import torch.optim as optim criterion = nn.CrossEntropyLoss() if use_gpu: criterion = criterion.cuda() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)",No,3,28.0 "# loop over the dataset multiple times for epoch in tqdm_notebook(range(10)): running_loss = 0.0 for i, data in tqdm_notebook(enumerate(trainloader, 0)): # get the inputs inputs, labels = data if torch.cuda.is_available(): # in versions of Torch < 0.4.0 we have to wrap these into torch.autograd.Variable as well inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda() # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics running_loss += loss.data[0] if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 print('Finished Training')",No,5,7.0 "dataiter = iter(testloader) images, labels = dataiter.next() # print images imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(32)))",No,5,84.0 "# in PyTorch 0.4.0 you won't need the Variable wrapper 
outputs = net(Variable(images).cuda()) if use_gpu else net(Variable(images))",No,5,77.0 "_, predicted = torch.max(outputs.data, 1) print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(32)))",No,5,53.0 "all_pred = np.empty((0, 10), float)",No,5,53.0 "for data in tqdm_notebook(testloader): images, _ = data if use_gpu: images = images.cuda() outputs = net(Variable(images)) curr_pred = F.softmax(outputs).data.cpu().numpy() all_pred = np.vstack([all_pred, curr_pred])",No,5,48.0 all_pred.shape,No,5,58.0 "pd.DataFrame(all_pred, columns=classes).to_csv('baseline.csv', index_label='id')",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))
import glob
from glob import glob

# Any results you write to the current directory are saved as output.",No,5,88.0 "train_path = '../input/train' path_name = train_path + '/**/*.jpg'",No,4,77.0 "train_image_paths = glob(path_name, recursive=True)",No,5,88.0 train_image_paths[:10],No,4,88.0 "train_categories = list(map(os.path.basename,train_image_paths))",No,3,88.0 train_categories[:3],No,5,53.0 "labels =[] for category in train_categories: labels.append(category[:3])",No,3,21.0 labels[:10],No,5,53.0 len(labels),No,5,58.0 len(train_image_paths),No,5,53.0 "num_classes = len(np.unique(labels)) num_classes",No,5,54.0 "#Encode labels with value between 0 and n_classes-1. from sklearn.preprocessing import LabelEncoder encoder = LabelEncoder() loadedLabels = np.asarray(labels) encoder.fit(loadedLabels) encoded_loadedLabels = encoder.transform(loadedLabels)",No,5,20.0 "# Encode labels to hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0]) from keras.utils.np_utils import to_categorical labels_Hot = to_categorical(encoded_loadedLabels, num_classes = num_classes)",No,5,20.0 labels_Hot[:3],No,5,53.0 df = pd.DataFrame(),No,5,12.0 df,No,5,41.0 df['path']=train_image_paths,No,4,8.0 df['path'].head(),No,5,41.0 df['labels'] = list(labels_Hot),No,5,8.0 "from keras.preprocessing.image import ImageDataGenerator IMG_SIZE = (128, 128) core_idg = ImageDataGenerator(samplewise_center=True, samplewise_std_normalization=True, horizontal_flip = True, vertical_flip = False, height_shift_range= 0.05, width_shift_range=0.1, rotation_range=5, shear_range = 0.1, fill_mode = 'reflect', zoom_range=0.15)",No,4,31.0 "def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args): base_dir = os.path.dirname(in_df[path_col].values[0]) print('## Ignore next message from keras, values are replaced anyways') df_gen = img_data_gen.flow_from_directory(base_dir, class_mode = 'sparse', **dflow_args) df_gen.filenames = in_df[path_col].values df_gen.classes = np.stack(in_df[y_col].values) df_gen.samples = in_df.shape[0] df_gen.n = in_df.shape[0] df_gen._set_index_array() df_gen.directory = '' # since we have the full path print('Reinserting dataframe: {} images'.format(in_df.shape[0])) return df_gen",No,3,21.0 "from sklearn.model_selection import train_test_split train_df, valid_df = train_test_split(df, test_size = 0.25, random_state = 2018)",No,5,13.0 len(train_df),No,5,58.0 len(valid_df),No,5,58.0 "train_gen = flow_from_dataframe(core_idg, train_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 32) valid_gen = flow_from_dataframe(core_idg, valid_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 256) # we can use much larger batches for evaluation # used a fixed dataset for evaluating the algorithm test_X, test_Y = next(flow_from_dataframe(core_idg, valid_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 1024)) # one big batch",No,4,21.0 "t_x, t_y = next(train_gen)",No,5,84.0 t_x.shape[1:],No,5,58.0 "from keras.applications import VGG16 from keras.applications.vgg16 import preprocess_input import keras from keras import backend as K from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint from keras.preprocessing.image import ImageDataGenerator from keras.utils.np_utils import to_categorical from keras.models import Model,Sequential, model_from_json from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D %matplotlib 
inline",No,5,23.0 "pretrained_model_1 = VGG16(include_top=False, input_shape=t_x.shape[1:]) base_model = pretrained_model_1 # Topless optimizer1 = keras.optimizers.Adam() # Add top layer x = base_model.output x = Conv2D(100, kernel_size = (3,3), padding = 'valid')(x) x = Flatten()(x) x = Dropout(0.75)(x) predictions = Dense(num_classes, activation='softmax')(x) model = Model(inputs=base_model.input, outputs=predictions) # Train top layer for layer in base_model.layers: layer.trainable = False model.compile(loss='categorical_crossentropy', optimizer=optimizer1, metrics=['accuracy']) model.summary()",Yes,4,30.0 "model.fit_generator(train_gen,steps_per_epoch=100,validation_data = (test_X, test_Y), epochs = 10)",No,5,7.0 "test_image_paths = glob('../input/test/*.jpg', recursive=True)",No,5,88.0 "img_width = 150 img_height = 150 TRAIN_DIR = '../input/train/' TEST_DIR = '../input/test/' train_images_dogs_cats = [TRAIN_DIR+i for i in os.listdir(TRAIN_DIR)] # use this for full dataset test_images_dogs_cats = [TEST_DIR+i for i in os.listdir(TEST_DIR)]",No,5,77.0 "train_images_dogs_cats.sort(key=natural_keys) train_images_dogs_cats_trim = train_images_dogs_cats[0:1300] train_images_dogs_cats_trim += train_images_dogs_cats[12500:13800] test_images_dogs_cats.sort(key=natural_keys)",Yes,4,9.0 "X, Y = prepare_data(train_images_dogs_cats_trim) # First split the data in two sets, 80% for training, 20% for Val/Test) X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=1)",Yes,3,21.0 "model = models.Sequential() model.add(layers.Conv2D(32, (3, 3), input_shape=(img_width, img_height, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(32, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(64, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Flatten()) model.add(layers.Dense(64)) model.add(layers.Activation('relu')) model.add(layers.Dropout(0.5)) model.add(layers.Dense(1)) model.add(layers.Activation('sigmoid'))",No,5,4.0 "X_test, Y_test = prepare_data(test_images_dogs_cats)",No,5,21.0 "Y_pred = model.predict(np.array(X_val)) #####predict cat | predict dog for i in range(0,5): if Y_pred[i, 0] >= 0.5: print('I am {:.2%} sure this is a Dog'.format(Y_pred[i][0])) else: print('I am {:.2%} sure this is a Cat'.format(1-Y_pred[i][0])) plt.imshow(X_val[i]) plt.show()",Yes,2,48.0 "XtestAdult = nTestAdult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]] XtestAdult.head()",No,4,41.0 "# Validating the score on validation data score = model.evaluate_generator(validation_generator) print('Test score:', score[0]) print('Test accuracy:', score[1])",No,5,49.0 knn = KNeighborsClassifier(n_neighbors=3),No,5,4.0 from sklearn.metrics import accuracy_score,No,5,22.0 "UCITest = pd.read_csv(""../input/mydata/adult.test"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 UCITest.head(),No,5,41.0 nUCITest = UCITest.dropna(),No,5,17.0 nUCITest.head(),No,5,41.0 "XUCITest = nUCITest[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]]",No,5,77.0 XUCITest.head(),No,5,41.0 "YUCIPred = knn.predict(XUCITest) YUCIPred",No,5,48.0 "accuracy_score(YUCIPred, YUCITest)",No,5,49.0 "knn.fit(Xadult, Yadult)",No,5,7.0 YUCIPred = knn.predict(XUCITest),No,5,48.0 "accuracy_score(YUCITest, YUCIPred)",No,5,49.0 "accuracies = {} for i in range(1, 100): knn = KNeighborsClassifier(n_neighbors=i) knn.fit(Xadult, Yadult) scores = cross_val_score(knn, Xadult, Yadult, cv=10) Ypred = knn.predict(XUCITest) accuracy = accuracy_score(YUCITest,Ypred) accuracies[i] = accuracy print('k={}, accuracy={}, CVmean={}'.format(i, accuracy, scores.mean())) ",Yes,4,49.0 "ks = list(accuracies.keys()) acc = list(accuracies.values()) plt.plot(ks, acc) plt.show()",No,5,33.0 knn = KNeighborsClassifier(n_neighbors=28),No,5,4.0 adult['Sex'] = adult['Sex'].transform(lambda x: 1 if x=='Male' else 0 if x==x else x),No,5,8.0 "predictions = knn.predict(testAdult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]])",No,5,48.0 "pretrained_model_1 = VGG16(include_top=False, input_shape=t_x.shape[1:])",No,5,4.0 "from keras import optimizers base_model = pretrained_model_1 # Topless add_model = Sequential() add_model.add(Flatten(input_shape=base_model.output_shape[1:])) add_model.add(Dense(256, activation='relu')) add_model.add(Dense(num_classes, activation='softmax')) model = Model(inputs=base_model.input, outputs=add_model(base_model.output)) model.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), metrics=['accuracy']) model.summary()",Yes,4,4.0 X_test = pd.DataFrame(),No,5,12.0 X_test['path'] = test_image_paths,No,5,8.0 "result = np.vstack((testAdult.index.values, predictions)).T x = ['Id','income'] resultado = pd.DataFrame(columns=x, data=result) resultado.set_index('Id', inplace=True)",Yes,3,11.0 X_test['labels'] = X_test['path'].map(lambda x: os.path.splitext(os.path.basename(x))[0]),No,5,8.0 resultado.to_csv('mypredictions.csv'),No,5,25.0 "store = pd.read_csv(""../input/store.csv"") train = pd.read_csv(""../input/train.csv"",parse_dates=[2]) test = pd.read_csv(""../input/test.csv"",parse_dates=[3])",No,5,45.0 store.head(),No,5,41.0 submission = pd.DataFrame(),No,5,12.0 "# check store nan rows store.isnull().sum()",No,5,39.0 store.PromoInterval.value_counts(),No,5,72.0 "submission.to_csv(""predictions.csv"",index=False)",No,5,25.0 "# fillna in store with 0 has better result than median() store.fillna(0, inplace=True)",No,5,17.0 "%matplotlib inline import warnings warnings.filterwarnings('ignore') import os import gc import time import pickle import feather import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from tqdm._tqdm_notebook import tqdm_notebook as tqdm tqdm.pandas() # from tqdm import tqdm # pd.options.display.max_rows = 999 # pd.options.display.max_columns = 999 import glob def get_path(str, first=True, parent_dir='../input/**/'): res_li = glob.glob(parent_dir+str) return res_li[0] if first else res_li",No,4,88.0 train.head().append(train.tail()),No,5,41.0 train.Open.value_counts(),No,5,72.0 "DATA_DIR = '../input/dogs-vs-cats-redux-kernels-edition/' evals = pd.read_csv('../input/dvc-prepare-evalset/evals.csv') evals.head()",Yes,4,45.0 "H, W, C = 224, 224, 3 #at least 197 batch_size = 32 eval_batch_size = batch_size * 4",No,5,77.0 "# 
# draw store 1 and store 10 sales distribution plot
import matplotlib.pyplot as plt
store_1 = train.loc[(train[""Store""]==1)&(train[\'Sales\']>0), [\'Date\',""Sales""]]
store_10 = train.loc[(train[""Store""]==10)&(train[\'Sales\']>0), [\'Date\',""Sales""]]
f = plt.figure(figsize=(18,10))
ax1 = f.add_subplot(211)
ax1.plot(store_1[\'Date\'], store_1[\'Sales\'], \'-\')
ax1.set_xlabel(\'Time\')
ax1.set_ylabel(\'Sales\')
ax1.set_title(\'Store 1 Sales Distribution\')

ax2 = f.add_subplot(212)
ax2.plot(store_10[\'Date\'], store_10[\'Sales\'], \'-\')
ax2.set_xlabel(\'Time\')
ax2.set_ylabel(\'Sales\')
ax2.set_title(\'Store 10 Sales Distribution\')",No,5,75.0 "import keras.backend as K from keras.models import Model from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping from keras import optimizers, losses, activations, models from keras.layers import Conv2D, Dense, Input, Flatten, Concatenate, Dropout, Activation from keras.layers import BatchNormalization, MaxPooling2D, GlobalAveragePooling2D from keras import applications",No,5,22.0 test.isnull().sum(),No,5,39.0 "# check stores open distribution on days of week import seaborn as sns sns.countplot(x = 'DayOfWeek', hue = 'Open', data = test) plt.title('Store Daily Open Countplot')",No,5,75.0 "# fill missing values in test with 1 test.fillna(value = 1, inplace = True)",No,5,17.0 "import seaborn as sns import matplotlib.pyplot as plt # check distribution of sales in train set fig = plt.figure(figsize=(12,5)) ax1 = fig.add_subplot(121) ax2 = fig.add_subplot(122) g1 = sns.distplot(train['Sales'],hist = True,label='skewness:{:.2f}'.format(train['Sales'].skew()),ax = ax1) g1.legend() g1.set(xlabel = 'Sales', ylabel = 'Density', title = 'Sales Distribution') g2 = sns.distplot(np.log1p(train['Sales']),hist = True,label='skewness:{:.2f}'.format(np.log1p(train['Sales']).skew()),ax=ax2) g2.legend() g2.set(xlabel = 'log(Sales+1)',ylabel = 'Density', title = 'log(Sales+1) Distribution') plt.show()",No,5,33.0 "# process train and test
def process(data, isTest = False):
# label encode some features
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
data.StoreType.replace(mappings, inplace=True)
data.Assortment.replace(mappings, inplace=True)
data.StateHoliday.replace(mappings, inplace=True)

# extract some features from date column
data['Month'] = data.Date.dt.month
data['Year'] = data.Date.dt.year
data['Day'] = data.Date.dt.day
data['WeekOfYear'] = data.Date.dt.weekofyear

# calculate competiter open time in months
data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \\
(data.Month - data.CompetitionOpenSinceMonth)
data['CompetitionOpen'] = data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)
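    # negative values mean the competitor had not opened yet at this row's date, so they are clamped to zero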

# calculate promo2 open time in months
data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \\
(data.WeekOfYear - data.Promo2SinceWeek) / 4.0
data['PromoOpen'] = data['PromoOpen'].apply(lambda x: x if x > 0 else 0)

# Indicate whether the month is in promo interval
month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \\
7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
data['month_str'] = data.Month.map(month2str)

def check(row):
if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
return 1
else:
return 0

data['IsPromoMonth'] = data.apply(lambda row: check(row),axis=1)

# select the features we need
features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
'StoreType', 'Assortment', 'CompetitionDistance',
'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
if not isTest:
features.append('Sales')

data = data[features]
return data

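# apply the same feature engineering to every split: train, valid, train_total and the test set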
train = process(train)
valid = process(valid)
train_total = process(train_total)
x_test = process(test,isTest = True) ",No,5,8.0 "train_steps = int(np.ceil(train_flow.n / batch_size)) valid_steps = int(np.ceil(valid_flow.n / eval_batch_size)) test_steps = int(np.ceil(test_flow.n / eval_batch_size))",No,5,77.0 "# try random forest from sklearn.ensemble import RandomForestRegressor clf = RandomForestRegressor(n_estimators = 15) clf.fit(x_train, y_train) # validation y_pred = clf.predict(x_valid) error = rmspe(np.expm1(y_valid), np.expm1(y_pred)) print('RMSPE: {:.4f}'.format(error))",Yes,3,4.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res_init.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 4]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",Yes,3,56.0 "import xgboost as xgb

params = {""objective"": ""reg:linear"", # for linear regression
""booster"" : ""gbtree"", # use tree based models
""eta"": 0.03, # learning rate
""max_depth"": 10, # maximum depth of a tree
""subsample"": 0.9, # Subsample ratio of the training instances
""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree
""silent"": 1, # silent mode
""seed"": 10 # Random number seed
}
num_boost_round = 4000

dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_valid, y_valid)
watchlist = [(dtrain, \'train\'), (dvalid, \'eval\')]
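# xgboost uses the last watchlist entry ('eval') for early stopping; feval=rmspe_xg scores it with the custom RMSPE metric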
# train the xgboost model
model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \\
early_stopping_rounds= 100, feval=rmspe_xg, verbose_eval=True)",Yes,4,59.0 "# validation y_pred = model.predict(xgb.DMatrix(x_valid)) error = rmspe(np.expm1(y_valid), np.expm1(y_pred)) print('RMSPE: {:.4f}'.format(error)) ",Yes,4,48.0 x_train_total.head().append(x_train_total.tail()),No,5,11.0 "print(x_train_total.shape) print(y_train_total.shape)",No,5,58.0 "dtrain = xgb.DMatrix(x_train_total, y_train_total) dtest = xgb.DMatrix(x_test) # specify parameters via map params = {""objective"": ""reg:linear"", # for linear regression ""booster"" : ""gbtree"", # use tree based models ""eta"": 0.03, # learning rate ""max_depth"": 10, # maximum depth of a tree ""subsample"": 0.9, # Subsample ratio of the training instances ""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree ""silent"": 1, # silent mode ""seed"": 10 # Random number seed } num_round = 3000 model = xgb.train(params, dtrain, num_round) # make prediction preds = model.predict(dtest)",Yes,4,59.0 model.load_weights('model.h5'),No,5,30.0 "DATA_DIR = '../input/dogs-vs-cats-redux-kernels-edition/' evals = pd.read_csv('../input/dvc-prepare-evalset/evals.csv') evals['path'] = evals['path'].apply(lambda x: x.replace('../input/', DATA_DIR)) evals.head()",Yes,4,45.0 "H, W, C = 150, 150, 3 batch_size = 32 eval_batch_size = batch_size * 4",No,5,77.0 "import tensorflow as tf import keras from keras.preprocessing.image import ImageDataGenerator train_gen = ImageDataGenerator( rotation_range=20, #width_shift_range=0.2, #height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, #channel_shift_range=0.2, horizontal_flip=True, #vertical_flip=True, #rescale=1./255,#!!!!NO! preprocessing_function=lambda x:(x-x.mean())/x.std() ) test_gen = ImageDataGenerator( #rescale=1./255, preprocessing_function=lambda x:(x-x.mean())/x.std() )",Yes,3,22.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res_finetune.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 4]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",No,4,56.0 "pred_val.shape, valid_flow.classes.shape",No,5,58.0 "n_final_state = 32 def get_model(n_final_state, lr=1e-3, decay=1e-8): input_shape = (H, W, C) input_x = Input(shape=input_shape) c1 = Conv2D(32, (3, 3))(input_x) c1 = BatchNormalization()(c1) c1 = Activation('relu')(c1) c1 = MaxPooling2D((2, 2))(c1) c2 = Conv2D(32, (3, 3))(c1) c2 = BatchNormalization()(c2) c2 = Activation('relu')(c2) c2 = MaxPooling2D((2, 2))(c2) c3 = Conv2D(64, (3, 3))(c2) c3 = BatchNormalization()(c3) c3 = Activation('relu')(c3) c3 = MaxPooling2D((2, 2))(c3) flat = Flatten()(c3) d1 = Dense( 64, activation='relu' )(flat) #d1 = Dropout(0.5)(d1) d1 = BatchNormalization()(d1) final_state = Dense( n_final_state, activation='relu', name='final_state' )(d1) x = Dropout(0.5)(final_state) outputs = Dense(1, activation='sigmoid')(x) model = Model(inputs=input_x, outputs=outputs) optimizer=optimizers.Adam(lr=lr, decay=decay) model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy']) return model model = get_model(n_final_state=n_final_state) model.summary()",No,5,4.0 "train_steps = int(np.ceil(train_flow.n / batch_size)) valid_steps = int(np.ceil(valid_flow.n / eval_batch_size)) test_steps = int(np.ceil(test_flow.n / eval_batch_size)) print(f'train {train_steps} steps') print(f'valid {valid_steps} steps') print(f'test {test_steps} steps')",No,5,77.0 "from sklearn.metrics import log_loss, accuracy_score
val_loss = log_loss(y_valid, pred_val)
val_acc = accuracy_score(y_valid, np.round(pred_val))
print(f'valid loss: {val_loss}\\t valid accuracy: {val_acc}')",No,5,49.0 "evals.loc[evals['is_test']==1, 'img_id'].shape",No,5,58.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 6]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",Yes,3,25.0 "subname = f'resnet50ft_{val_loss:.6f}.csv' sub.to_csv(subname, index=False) print(subname, 'saved')",No,5,25.0 "def predict(model, modelpath, data_flow, steps, workers=4, verbose=1): model.load_weights(modelpath) pred = model.predict_generator( generator=data_flow, steps=steps, use_multiprocessing=True if workers>1 else False, workers=workers, verbose=verbose ) return pred",Yes,3,30.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 "from sklearn.linear_model import LinearRegression from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from sklearn.preprocessing import PolynomialFeatures",No,5,22.0 "print(pred_val_best.shape, pred_val_best_tta.shape) sns.distplot(pred_val_best) sns.distplot(pred_val_best_tta) plt.legend(['normal', 'fliplr']); plt.grid();",Yes,4,33.0 "X_train = train[['LotFrontage','LotArea']].fillna(0) X_test = test[['LotFrontage','LotArea']].fillna(0) y_train = train['SalePrice']",Yes,4,17.0 "from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error",No,5,22.0 "for i,(p, p_tta) in enumerate(zip(pred_val_li, pred_val_tta_li)): print(i+2, 'th snapshot normal loss: {:.6f} acc: {:.6f}'.format( log_loss(y_valid, p), accuracy_score(y_valid, np.round(p)) )) print(i+2, 'th snapshot tta loss: {:.6f} acc: {:.6f}'.format( log_loss(y_valid, p_tta), accuracy_score(y_valid, np.round(p_tta)) ))",No,5,49.0 "X_meta = pred_val_li + pred_val_tta_li X_meta = np.hstack(X_meta) X_meta.shape",Yes,4,11.0 "pipe.fit(X_train, y_train)",No,5,7.0 preds = pipe.predict(X_test),No,5,48.0 "sub = pd.DataFrame({'Id': test.Id, 'SalePrice': preds}) sub.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn.linear_model import LogisticRegressionCV meta_model = LogisticRegressionCV(scoring='neg_log_loss') meta_model.fit(X_meta, y_valid) print(meta_model.coef_, meta_model.intercept_)",Yes,3,7.0 "train = pd.read_csv(""../input/comp_train.csv"") test = pd.read_csv(""../input/comp_test.csv"") print(train.shape)",Yes,4,45.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import cv2 import os",No,5,23.0 "# Fixed for our Cats & Dogs classes NUM_CLASSES = 2 # Fixed for Cats & Dogs color images CHANNELS = 3 IMAGE_RESIZE = 224 RESNET50_POOLING_AVERAGE = 'avg' DENSE_LAYER_ACTIVATION = 'softmax' OBJECTIVE_FUNCTION = 'categorical_crossentropy' # Common accuracy metric for all outputs, but can use different metrics for different output LOSS_METRICS = ['accuracy'] # EARLY_STOP_PATIENCE must be < NUM_EPOCHS NUM_EPOCHS = 10 EARLY_STOP_PATIENCE = 3 # These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively # Training images processed in each step would be no.-of-train-images / STEPS_PER_EPOCH_TRAINING STEPS_PER_EPOCH_TRAINING = 10 STEPS_PER_EPOCH_VALIDATION = 10 # These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively # NOTE that these BATCH* are for Keras ImageDataGenerator batching to fill epoch step input BATCH_SIZE_TRAINING = 100 BATCH_SIZE_VALIDATION = 100 # Using 1 to easily manage mapping between test_generator 
& prediction for submission preparation BATCH_SIZE_TESTING = 1",No,5,77.0 import matplotlib.pyplot as plt,No,5,22.0 "from tensorflow.python.keras.applications import ResNet50 from tensorflow.python.keras.models import Sequential from tensorflow.python.keras.layers import Dense ### ### Below systax is available with TensorFlow 1.11 onwards but this upgrade is not available for Kaggle kernel yet ### #import tensorflow as tf #print(tf.__version__) #import tensorflow as tf #from tf.keras.applications import ResNet50 #from tf.keras.models import Sequential",No,5,22.0 resnet_weights_path = '../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',No,5,77.0 "#Still not talking about our train/test data or any pre-processing. model = Sequential() # 1st layer as the lumpsum weights from resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 # NOTE that this layer will be set below as NOT TRAINABLE, i.e., use it as is model.add(ResNet50(include_top = False, pooling = RESNET50_POOLING_AVERAGE, weights = resnet_weights_path)) # 2nd layer as Dense for 2-class classification, i.e., dog or cat using SoftMax activation model.add(Dense(NUM_CLASSES, activation = DENSE_LAYER_ACTIVATION)) # Say not to train first layer (ResNet) model as it is already trained model.layers[0].trainable = False",No,5,4.0 "plt.scatter(train[""OverallQual""],train[""SalePrice""])",No,5,33.0 "plt.scatter(train[""TotalBsmtSF""],train[""SalePrice""])",No,5,33.0 "plt.scatter(train[""YearBuilt""],train[""SalePrice""])",No,5,33.0 "X_train = train[""TotalBsmtSF""].values y_train = train[""SalePrice""].values ",No,5,21.0 "fit_history = model.fit_generator( train_generator, steps_per_epoch=STEPS_PER_EPOCH_TRAINING, epochs = NUM_EPOCHS, validation_data=validation_generator, validation_steps=STEPS_PER_EPOCH_VALIDATION, callbacks=[cb_checkpointer, cb_early_stopper] ) model.load_weights(""../working/best.hdf5"")",No,4,30.0 "import numpy as np m=train.shape[0] #changing the shape to ,x1 one=np.ones((m,1)) X_train = X_train.reshape((m,1)) y_train = y_train.reshape((m,1)) X1=np.hstack((X_train,one)) ",Yes,3,11.0 " plt.figure(1, figsize = (15,8)) plt.subplot(221) plt.plot(fit_history.history['acc']) plt.plot(fit_history.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'valid']) plt.subplot(222) plt.plot(fit_history.history['loss']) plt.plot(fit_history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch') plt.legend(['train', 'valid']) plt.show()",No,5,35.0 "TRAIN_PATH = os.path.join(""../input/ai-academy-intermediate-class-competition-1"", ""BBC News Train.csv"") #Load the data using pandas : Create a DataFrame named df, that contains the training data df = pd.read_csv(TRAIN_PATH)",No,5,45.0 "X_test=test.iloc[:,1:].values m_test=X_test.shape[0] one=np.ones((m_test,1)) X_test=np.hstack((X_test,one))",Yes,3,11.0 "# List first 5 entries in dataframe to make sure it was loaded properly # and review the various colums in the dataframe df.head()",No,5,41.0 "# Associate Category names with numerical index and save it in new column category_id df['category_id'] = df['Category'].factorize()[0] #View first 10 entries of category_id, as a sanity check df['category_id'][0:10]",No,5,8.0 "prediction=np.dot(X_test,theta) prediction",No,5,48.0 "# Create a new pandas dataframe ""category_id_df"", which only has unique Categories, also sorting this list in order of category_id values
category_id_df = df[[\'Category\', \'category_id\']].drop_duplicates().sort_values(\'category_id\')",No,5,9.0 "sub=pd.DataFrame()
sub[\'Id\'] = test[\'Id\']
sub[\'SalePrice\']=prediction
sub.to_csv(""prediction.csv"", index = False)",Yes,4,25.0 "# Create a dictionary ( python datastructure - like a lookup table) that # can easily convert category names into category_ids and vice-versa category_to_id = dict(category_id_df.values) id_to_category = dict(category_id_df[['category_id', 'Category']].values)",No,5,77.0 print(prediction.shape),No,5,58.0 "# Pick 5 random samples from the dataframe df.sample(5, random_state=0)",No,5,41.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os RANDOM_STATE = 31415",Yes,4,22.0 "# Group the dataframe by categories and count items ( number of news articles) in each category df.groupby('Category').category_id.count() ",No,5,72.0 "#Plot the distribution of news articles by category df.groupby('Category').category_id.count().plot.bar(ylim=0)",No,5,33.0 "from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english') features = tfidf.fit_transform(df.Text).toarray() # Remaps the words in the 1490 articles in the text column of # data frame into features (superset of words) with an importance assigned # based on each words frequency in the document and across documents labels = df.category_id # represents the category of each of the 1490 articles ",No,5,8.0 "#Get a feel of the features identified by tfidf features.shape # How many features are there ? ",No,5,58.0 "# metric to optimize from sklearn.metrics import mean_squared_error from sklearn.metrics import make_scorer scorer = make_scorer(lambda y_test, predictions: np.sqrt(mean_squared_error(y_test, predictions)))",Yes,4,49.0 "colors = [\'pink\', \'green\', \'midnightblue\', \'orange\', \'darkgrey\']

# Find points belonging to each category and plot them
for category, category_id in sorted(category_to_id.items()):
points = projected_features[(labels[indices] == category_id).values]
plt.scatter(points[:, 0], points[:, 1], s=30, c=colors[category_id], label=category)
plt.title(""tf-idf feature vector for each article, projected on 2 dimensions."",
fontdict=dict(fontsize=15))
plt.legend()",No,5,33.0 "training_set = pd.read_csv('../input/train.csv') training_set['datetime'] = training_set['datetime'].apply(lambda x: pd.to_datetime(x).timestamp())",Yes,4,45.0 features.shape,No,5,58.0 training_set.head(),No,5,41.0 "from sklearn.model_selection import train_test_split # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['registered', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.casual, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['casual']) X_test = X_test.drop(columns = ['casual'])",Yes,3,13.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV ?np.random.uniform",No,5,84.0 "casual_model.fit(X_train, y_train) casual_model = casual_model.best_estimator_",Yes,4,7.0 "# Same thing for the second variable # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['casual', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.registered, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['registered']) X_test = X_test.drop(columns = ['registered'])",Yes,3,13.0 "registered_model.fit(X_train, y_train) registered_model = registered_model.best_estimator_",Yes,4,7.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), MultinomialNB(), LogisticRegression(random_state=0), ] ",Yes,4,82.0 "# Final prediction of the baseline models, as I am not going to tweak them, I will move directly to the test data

test_dataset = pd.read_csv(""../input/test.csv"")
dates = test_dataset[\'datetime\']
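# keep the raw datetime strings so they can be written back into the submission frame later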
test_dataset[\'datetime\'] = test_dataset[\'datetime\'].apply(lambda x: pd.to_datetime(x).timestamp())",Yes,4,45.0 "CV = 5 # Cross Validate with 5 different folds of 20% data ( 80-20 split with 5 folds ) #Create a data frame that will store the results for all 5 trials of the 3 different models cv_df = pd.DataFrame(index=range(CV * len(models))) entries = [] # Initially all entries are empty",No,4,12.0 "casual = casual_model.predict(test_data) registered = registered_model.predict(test_data) total = casual + registered",No,5,48.0 "#For each Algorithm for model in models: model_name = model.__class__.__name__ # create 5 models with different 20% test sets, and store their accuracies accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV) # Append all 5 accuracies into the entries list ( after all 3 models are run, there will be 3x5 = 15 entries) for fold_idx, accuracy in enumerate(accuracies): entries.append((model_name, fold_idx, accuracy))",No,5,3.0 "# Store the entries into the results dataframe and name its columns cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])",No,5,12.0 "test_dataset['datetime'] = dates test_dataset['count'] = pd.Series(total)",No,5,8.0 "import seaborn as sns

sns.boxplot(x=\'model_name\', y=\'accuracy\', data=cv_df)
sns.stripplot(x=\'model_name\', y=\'accuracy\', data=cv_df,
size=8, jitter=True, edgecolor=""gray"", linewidth=2)",No,5,33.0 test_dataset[test_dataset['count'] < 0],No,5,14.0 "# Mean accuracy of each algorithm cv_df.groupby('model_name').accuracy.mean()",No,5,60.0 "test_dataset.loc[test_dataset['count'] < 0, 'count'] = 0",No,5,8.0 test_dataset[test_dataset['count'] <= 0],No,5,14.0 cv_df,No,5,41.0 "test_dataset[['datetime', 'count']].to_csv('result.csv', index = False)",No,5,25.0 "from sklearn.model_selection import train_test_split model = LogisticRegression(random_state=0) #Split Data X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.33, random_state=0) #Train Algorithm model.fit(X_train, y_train) # Make Predictions y_pred_proba = model.predict_proba(X_test) y_pred = model.predict(X_test)",Yes,3,7.0 "import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt sns.set(color_codes=True) import os print(os.listdir(""../input""))",No,5,88.0 "df = pd.read_csv(""../input/train.tsv"", sep=""\\t"")
df_test = pd.read_csv(""../input/test.tsv"", sep=""\\t"")",No,5,45.0 df.shape,No,5,58.0 "from sklearn.metrics import confusion_matrix import seaborn as sns conf_mat = confusion_matrix(y_test, y_pred) sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values) plt.ylabel('Actual') plt.xlabel('Predicted')",No,5,80.0 "model.fit(features, labels)",No,5,7.0 "import os print(os.listdir(""../input/bbc-test""))",No,5,88.0 "TEST_PATH = os.path.join(""../input/bbc-test"", ""BBC News Test.csv"") #Load the data using pandas : Create a DataFrame test_df = pd.read_csv(TEST_PATH) ",No,5,45.0 "from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from sklearn.model_selection import StratifiedKFold",No,5,22.0 "test_features = tfidf.transform(test_df.Text.tolist()) Y_pred = model.predict(test_features) Y_pred",Yes,3,8.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import datetime from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "#Create Submission Dataframe submission = pd.DataFrame({ ""ArticleId"": test_df[""ArticleId""], ""Category"": Y_pred_name })",No,5,12.0 "# Convert submission dataframe to csv # you could use any filename. We choose submission here submission.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import pandas as pd import numpy as np import seaborn as sns import missingno as msno import gc",No,5,22.0 "def PrepareFeatures(TestOrTrain):
source = f\'../input/{TestOrTrain}.json\'
data = pd.read_json(source)
# Some noise in price feature we saw in Part 1:
ulimit = np.percentile(data.price.values, 99)
data[\'price\'][data[\'price\']>ulimit] = ulimit
# Get Different features as in Part 1:
data[\'hasDesc\'] = data[\'description\'].apply(lambda x: len(x.strip())!=0)
data[""nFeatures""] = data[""features""].apply(len)
data[""nDescWords""] = data[""description""].apply(lambda x: len(x.split("" "")))
data[\'nPhotos\'] = data[\'photos\'].apply(lambda x: min(10, len(x)))
data[\'created\'] = pd.to_datetime(data[\'created\'])
data[\'month\'] = data[\'created\'].dt.month
data[\'weekday\'] = data[\'created\'].apply(lambda x: x.weekday())
return data

# Using categorical (more sparse) data, we ispected in Part 1:
def CreateCategFeat(data, features_list):
f_dict = {\'hasParking\':[\'parking\', \'garage\'], \'hasGym\':[\'gym\', \'fitness\', \'health club\'],
\'hasPool\':[\'swimming pool\', \'pool\'], \'noFee\':[\'no fee\', ""no broker\'s fees""],
\'hasElevator\':[\'elevator\'], \'hasGarden\':[\'garden\', \'patio\', \'outdoor space\'],
\'isFurnished\': [\'furnished\', \'fully equipped\'],
\'reducedFee\':[\'reduced fee\', \'low fee\'],
\'hasAC\':[\'air conditioning\', \'central a/c\', \'a/c\', \'central air\', \'central ac\'],
\'hasRoof\':[\'roof\', \'sundeck\', \'private deck\', \'deck\'],
\'petFriendly\':[\'pets allowed\', \'pet friendly\', \'dogs allowed\', \'cats allowed\'],
\'shareable\':[\'shares ok\'], \'freeMonth\':[\'month free\'],
\'utilIncluded\':[\'utilities included\']}
for feature in features_list:
data[feature] = False
for ind, row in data.iterrows():
for f in row[\'features\']:
f = f.lower().replace(\'-\', \'\')
if any(e in f for e in f_dict[feature]):
data.at[ind, feature]= True",No,5,8.0 "data = PrepareFeatures(\'train\')
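# these flag names must match the keys of f_dict inside CreateCategFeat above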
cat_features = [\'hasParking\', \'hasGym\', \'hasPool\', \'noFee\', \'hasElevator\',
\'hasGarden\', \'isFurnished\', \'reducedFee\', \'hasAC\', \'hasRoof\',
\'petFriendly\', \'shareable\', \'freeMonth\', \'utilIncluded\']
CreateCategFeat(data, cat_features)
features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"",
""nPhotos"", ""hasDesc"", \'nFeatures\', \'nDescWords\', ""month"", \'weekday\']
features.extend(cat_features)
X = data[features]
y = data[""interest_level""]",No,5,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05) clf = RandomForestClassifier(n_estimators=2000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,7.0 "test = PrepareFeatures('test') CreateCategFeat(test, cat_features) X = test[features] y = clf.predict_proba(X)",Yes,4,48.0 "labels2idx = {label: i for i, label in enumerate(clf.classes_)} sub = pd.DataFrame() sub[""listing_id""] = test[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",Yes,4,25.0 "import math import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline ",Yes,4,22.0 "df = pd.read_csv('../input/training/training.csv') df_test=pd.read_csv('../input/test/test.csv') df.dropna(inplace=True) df.shape",Yes,3,45.0 "y = df.iloc[:, :-1].values y.shape",Yes,4,14.0 "from sklearn.model_selection import train_test_split x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42) x_train.shape, x_val.shape",Yes,4,13.0 "x_train.shape, x_val.shape",No,5,58.0 "# Definir correctamente la red neuronal (5 pts)
from keras.models import Sequential
from keras.layers import GlobalAveragePooling2D, Dense, Flatten,BatchNormalization, Dropout, Conv2D, MaxPool2D
from keras.optimizers import Adam, SGD
from keras import regularizers

lr = 0.01
bs = 256
nb = math.ceil(len(x_train)/bs)

final_model = Sequential([
Conv2D(32, 3, activation='relu', input_shape=(96,96,1)),
MaxPool2D(),
Conv2D(16, 3, activation='relu'),
GlobalAveragePooling2D(),
Dense(256, activation='relu', kernel_initializer='glorot_normal'),
Dropout(0.7),
Dense(128, activation='relu', kernel_initializer='glorot_normal'),
Dense(64, activation='relu', kernel_initializer='glorot_normal'),
Dense(30) # no activation function is used here because a regression is performed for each coordinate
])
final_model.compile(Adam(lr), loss='mse', metrics=['mae'])
final_model.summary()",Yes,4,4.0 "log = final_model.fit(x_train, y_train, batch_size=100, epochs=100,validation_data=[x_val, y_val])",No,5,7.0 "# Training results # - mae between 10 and 15 (3 pts) # - mae between 8 and 11 (5 pts) # - mae between 5 and 8 (7 pts) # - mae less than or equal to 4.0 (9 pts) print(f'MAE final: {final_model.evaluate(x_val, y_val)[1]}')",No,5,49.0 "x_val[0,None].shape",No,5,58.0 "results=final_model.predict(test) results.shape",Yes,4,48.0 "lookup = pd.read_csv('../input/IdLookupTable.csv') ",No,5,45.0 "submission = pd.concat([rowid,loc],axis = 1)",No,5,11.0 "submission.to_csv('submission2.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/atividade-3-pmr3508""))
print(os.listdir(""../input/distance-to-coast""))
print(os.listdir(""../input/califdata""))
# Any results you write to the current directory are saved as output.",No,5,88.0 "train_data = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"")
test_data = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"")
#Saving the id\'s, just in case they are needed in the submission
train_Id = train_data.loc[:,\'Id\']
test_Id = test_data.loc[:,\'Id\']
test_data = test_data.drop(\'Id\',axis = \'columns\')
train_data = train_data.drop(\'Id\',axis = \'columns\')
train_data

",Yes,3,45.0 "def adding_new_features(df): df.loc[:,'mean_rooms'] = df.loc[:,'total_rooms']/df.loc[:,'households'] df.loc[:,'rooms_per_person'] = df.loc[:,'total_rooms']/df.loc[:,'population'] df.loc[:,'mean_bedrooms'] = df.loc[:,'total_bedrooms']/df.loc[:,'households'] df.loc[:,'bedrooms_per_person'] = df.loc[:,'total_bedrooms']/df.loc[:,'households'] df.loc[:,'persons_per_household'] = df.loc[:,'population']/df.loc[:,'households'] df.loc[:, 'median_income_per_person'] = df.loc[:,'median_income']/df.loc[:,'persons_per_household'] adding_new_features(train_data) adding_new_features(test_data) train_data ",No,5,8.0 "train_data['longitude'].plot(kind='hist') test_data['longitude'].plot(kind='hist') ",No,5,33.0 "train_data['latitude'].plot(kind='hist') test_data['latitude'].plot(kind='hist')",No,5,33.0 "dist2coast = pd.read_csv(""../input/distance-to-coast/dist2coast.txt"",delim_whitespace = True) ",No,5,45.0 "def saving(name,y_predict): # ""saving"" some time with a compressed writing code
df = pd.DataFrame()
df[\'Id\'] = test_Id
df.set_index(\'Id\', inplace=True)
df[\'median_house_value\'] =y_predict
print(df)
return df.to_csv(name)",Yes,4,25.0 "from sklearn import tree
x_train_data = train_data.drop(\'median_house_value\', axis = \'columns\')
y_train_data = train_data.loc[:,\'median_house_value\']
reg1 = tree.DecisionTreeRegressor(max_depth = 1)
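# max_depth=1 is effectively a decision stump, so this serves only as a very rough baseline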
reg1 = reg1.fit(x_train_data, y_train_data)
DTR_y = reg1.predict(test_data)
saving(""DecisionTreeRegression_1.csv"",DTR_y)
",Yes,3,7.0 "from sklearn.neighbors import KNeighborsRegressor reg2 = KNeighborsRegressor(n_neighbors=50) #50nn = 0.37494 at 70% of the test database, 1000nn = 0.38598 reg2 = reg2.fit(x_train_data, y_train_data) knnR_y = reg2.predict(test_data) saving(""50nn Regressor.csv"", knnR_y)",Yes,3,7.0 "from sklearn import linear_model reg3 = linear_model.LassoLars(alpha=.1, positive = True) reg3.fit(x_train_data, y_train_data) print (reg3.coef_) LASSO_y = reg3.predict(test_data) saving(""LASSO LARS.csv"", LASSO_y) # score = 0.38951 ",Yes,3,7.0 "from sklearn.neural_network import MLPRegressor reg4 = MLPRegressor() reg4.fit(x_train_data, y_train_data) MLP_y = reg4.predict(test_data) saving(""MultiLayerPerceptrons Regressor.csv"", MLP_y) #0.373",Yes,3,7.0 "reg5 = linear_model.BayesianRidge() reg5.fit(x_train_data, y_train_data) BRR_y = reg5.predict(test_data) scores5 = cross_val_score(reg5, x_train_data, y_train_data, cv = 10) saving(""Bayesian Ridge Regressor.csv"", BRR_y)",Yes,3,7.0 "reg7 = ExtraTreesRegressor() reg7.fit(x_train_data, y_train_data) ET_y = reg7.predict(test_data) saving(""ExtraTreesRegressor.csv"", ET_y) # scores = 0.24383 ",Yes,3,7.0 "from sklearn.ensemble import AdaBoostRegressor reg8 = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth=20), n_estimators=50) reg8.fit(x_train_data, y_train_data) ADA_y = reg8.predict(test_data) saving(""ADA Boost Regression.csv"",ADA_y) # 0.22756",Yes,3,7.0 "reg_F = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth=20), n_estimators=50) reg_F.fit(x_train_data, y_train_data) final = reg_F.predict(test_data) saving(""Final ADA Boost Regression.csv"",final) # 0.22756",Yes,3,7.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') sample = pd.read_csv('../input/test.csv')",No,5,45.0 "x_train = train[['LotArea','LotFrontage']].copy() y_train = train['SalePrice'].copy()",No,5,21.0 "x_test = test[['LotArea','LotFrontage']].copy()",No,5,21.0 y_train.head(),No,5,41.0 x_train.head(),No,5,41.0 x_train.shape,No,5,58.0 x_train.isnull().sum(),No,5,39.0 "x_train.fillna(0,inplace=True) x_test.fillna(0,inplace=True)",No,5,17.0 "from sklearn.linear_model import LinearRegression from sklearn.preprocessing import StandardScaler",No,5,22.0 "x_train_scaler = x_scaler.fit_transform(x_train) x_test_scaler = x_scaler.transform(x_test)",No,5,18.0 lr = LinearRegression(),No,5,4.0 "lr.fit(x_train_scaler,y_train)",No,5,7.0 pred = lr.predict(x_test_scaler),No,5,48.0 pd.read_csv('../input/sample_sumbission.csv').head(),No,5,45.0 "sub = pd.DataFrame(data = {'Id' : test.Id, 'SalePrice' :pred})",No,5,12.0 "sub.to_csv('submission.csv', index=False)",No,5,25.0 "import cv2 # working with, mainly resizing, images import numpy as np # dealing with arrays import os # dealing with directories from random import shuffle # mixing up or currently ordered data that might lead our network astray in training. 
train_dir = '../input/train' test_dir = '../input/test'",No,5,77.0 "X = np.array([i[0] for i in train]).reshape(-1,1,50,50) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,1,50,50) test_y = [i[1] for i in test]",No,5,21.0 "from keras.models import Sequential from keras.layers import Dense , Activation from keras.layers import Dropout from keras.layers import Flatten from keras.constraints import maxnorm from keras.optimizers import SGD from keras.layers import Convolution2D from keras.layers import Conv2D , BatchNormalization from keras.layers import MaxPooling2D from keras.utils import np_utils from keras import backend as K K.set_image_dim_ordering('th')",No,5,23.0 "# Initialising the CNN classifier = Sequential() # Step 1 - Convolution classifier.add(Convolution2D(32, 3, 3, input_shape = (1,50,50), activation = 'relu')) # Step 2 - Pooling classifier.add(MaxPooling2D(pool_size = (2, 2))) # Adding a second convolutional layer classifier.add(Convolution2D(32, 3, 3, activation = 'relu')) classifier.add(MaxPooling2D(pool_size = (2, 2))) # Adding a third convolutional layer classifier.add(Convolution2D(64, 3, 3, activation = 'relu')) classifier.add(MaxPooling2D(pool_size = (2, 2))) # Step 3 - Flattening classifier.add(Flatten()) # Step 4 - Full connection classifier.add(Dense(output_dim = 64, activation = 'relu')) classifier.add(Dropout(0.4)) classifier.add(Dense(output_dim = 2, activation = 'sigmoid')) ",No,5,4.0 "import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
pd.options.display.float_format = '{:.2f}'.format
rc={'savefig.dpi': 75, 'figure.autolayout': False, 'figure.figsize': [12, 8], 'axes.labelsize': 18,\\
'axes.titlesize': 18, 'font.size': 18, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 16,\\
'xtick.labelsize': 16, 'ytick.labelsize': 16}

sns.set(style='dark',rc=rc)",Yes,4,23.0 "# Setting working directory path = '../input/' path_result = '../output/'",No,5,77.0 "train = pd.read_csv(path + \'train_data.csv\')
test = pd.read_csv(path + \'teste_data.csv\')
train = train.rename(columns={""default"": ""target"", ""ids"":""id""})
test = test.rename(columns={""ids"":""id""})",No,4,45.0 test_id = test.id,No,5,77.0 "def missing_values_table(df):
mis_val = df.isnull().sum()
mis_val_percent = 100 * df.isnull().sum() / len(df)
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
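    # side-by-side table of absolute missing-value counts and their share of all rows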
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : \'Missing Values\', 1 : \'% of Total Values\'})
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
\'% of Total Values\', ascending=False).round(1)
print (""Your selected dataframe has "" + str(df.shape[1]) + "" columns.\
""
""There are "" + str(mis_val_table_ren_columns.shape[0]) +
"" columns that have missing values."")
return mis_val_table_ren_columns",No,5,53.0 missing_values_table(train),No,5,39.0 "missingValueColumns = train.columns[train.isnull().any()].tolist() df_null = train[missingValueColumns]",No,5,10.0 "msno.bar(df_null,figsize=(20,8),color=default_color,fontsize=18,labels=True)",No,5,34.0 "msno.heatmap(df_null,figsize=(20,8),cmap=colormap)",No,5,80.0 "msno.dendrogram(df_null,figsize=(20,8))",No,5,34.0 train = train.dropna(subset=['target']),No,5,17.0 "plt.figure(figsize=(15,5)) ax = sns.countplot('target',data=train,color=default_color) for p in ax.patches: ax.annotate('{:.2f}%'.format(100*p.get_height()/len(train['target'])), (p.get_x()+ 0.3, p.get_height()+0.2))",No,5,33.0 "meta_data = get_meta(train) meta_data",No,5,77.0 "meta_counts = meta_data.groupby(['role', 'level']).agg({'dtype': lambda x: x.count()}).reset_index() meta_counts",No,5,60.0 "fig,ax = plt.subplots()
fig.set_size_inches(20,5)
sns.barplot(data=meta_counts[(meta_counts.role != \'target\') & (meta_counts.role != \'id\') ],x=""level"",y=""dtype"",ax=ax,color=default_color)
ax.set(xlabel=\'Variable Type\', ylabel=\'Count\',title=""Variables Count Across Datatype"")",No,5,33.0 "col_ordinal = meta_data[(meta_data.level == 'ordinal') & (meta_data.keep)].index col_nominal = meta_data[(meta_data.level == 'nominal') & (meta_data.keep)& (meta_data.role != 'target')& (meta_data.role != 'id')].index col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index col_binary = meta_data[(meta_data.level == 'binary') & (meta_data.keep) & (meta_data.role != 'target')].index",No,5,14.0 "def count_label_encoding(train, test,col): for i in col: df1 = train[i].value_counts().reset_index(name='freq_'+ i).rename(columns={'index': 'lc_'+ i}) train = pd.merge(train,df1,left_on=i, right_on='lc_'+ i, how='left') test = pd.merge(test,df1,left_on=i, right_on='lc_'+ i, how='left') for i in list(train): if 'lc_' in i: train = train.drop(i, axis = 1) test = test.drop(i, axis = 1) return train, test",No,4,10.0 "train, test = count_label_encoding(train, test,col_nominal) train, test = count_label_encoding(train, test,col_binary)",No,5,53.0 "plt.figure(figsize=(18,16)) plt.title('Pearson correlation of continuous features', y=1.05, size=15) sns.heatmap(train[col_interval].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True, fmt = '.2f')",No,5,80.0 from sklearn.model_selection import train_test_split,No,5,22.0 "X = pd.concat([train[col_interval],train[col_ordinal],pd.get_dummies(train[col_binary])], axis=1) y = pd.DataFrame(train.target) X.fillna(-1, inplace=True) y.fillna(-1, inplace=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)",Yes,3,21.0 y.shape,No,5,58.0 X.shape,No,5,58.0 X.head(),No,5,41.0 "plt.figure(figsize=(18,16)) plt.title('Pearson correlation of continuous features', y=1.05, size=15) sns.heatmap(X.corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=False, fmt = '.1f')",No,5,80.0 "from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=150, max_depth=8, min_samples_leaf=30, max_features=0.2, n_jobs=-1, random_state=0)
rf.fit(X_train, y_train[\'target\'])
features = X_train.columns.values
print(""----- Training Done -----"")",No,5,7.0 "from sklearn.metrics import accuracy_score, roc_auc_score",No,5,22.0 "acc = accuracy_score(y_train, rf.predict(X_train)) auc = roc_auc_score(y_train, rf.predict(X_train)) print(""Accuracy: %.4f"" % acc) print(""AUC: %.4f"" % auc)",No,5,28.0 "acc = accuracy_score(y_test, rf.predict(X_test)) auc = roc_auc_score(y_test, rf.predict(X_test)) print(""Accuracy: %.4f"" % acc) print(""AUC: %.4f"" % auc)",No,5,49.0 "def get_feature_importance_df(feature_importances,
column_names,
top_n=25):
""""""Get feature importance data frame.

Parameters
----------
feature_importances : numpy ndarray
Feature importances computed by an ensemble
model like random forest or boosting
column_names : array-like
Names of the columns in the same order as feature
importances
top_n : integer
Number of top features

Returns
-------
df : a Pandas data frame

""""""

imp_dict = dict(zip(column_names,
feature_importances))
top_features = sorted(imp_dict,
key=imp_dict.get,
reverse=True)[0:top_n]
top_importances = [imp_dict[feature] for feature
in top_features]
df = pd.DataFrame(data={\'feature\': top_features,
\'importance\': top_importances})
return df",No,5,86.0 "feature_importance = get_feature_importance_df(rf.feature_importances_, features) feature_importance",No,5,86.0 "fig,ax = plt.subplots()
fig.set_size_inches(20,10)
sns.barplot(data=feature_importance[:10],x=""feature"",y=""importance"",ax=ax,color=default_color,)
ax.set(xlabel=\'Variable name\', ylabel=\'Importance\',title=""Variable importances"")",No,5,79.0 "from xgboost import XGBClassifier from lightgbm import LGBMClassifier from catboost import CatBoostClassifier from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier",No,5,22.0 "#RandomForest params rf_params = {} rf_params['n_estimators'] = 200 rf_params['max_depth'] = 6 rf_params['min_samples_split'] = 70 rf_params['min_samples_leaf'] = 30",No,5,59.0 "rf_model = RandomForestClassifier(**rf_params, random_state=29,n_jobs = -1)",No,5,4.0 "# XGBoost params xgb_params = {} xgb_params['learning_rate'] = 0.02 xgb_params['n_estimators'] = 1000 xgb_params['max_depth'] = 4 xgb_params['subsample'] = 0.9 xgb_params['colsample_bytree'] = 0.9",No,5,59.0 "XGB_model = XGBClassifier(**rf_params, random_state=29,n_jobs=-1)",No,5,4.0 "def create_extra_features(train_ext): train_ext['null_sum'] = train_ext[train_ext==-1].count(axis=1) #train_ext['bin_sum'] = train_ext[col_binary].sum(axis=1) train_ext['ord_sum'] = train_ext[col_ordinal].sum(axis=1) train_ext['interval_median'] = train_ext[col_interval].sum(axis=1) train_ext['new_amount_borrowed_by_income'] = train_ext['amount_borrowed']/train_ext['income'] train_ext['new_amount_borrowed_by_months'] = train_ext['amount_borrowed']/train_ext['borrowed_in_months'] return train_ext",No,5,8.0 ids_targets = meta_data[meta_data['role'] != 'input'].index,No,5,77.0 train_ext.head(),No,5,41.0 test_ext.head(),No,5,41.0 "train_ext.fillna(-1, inplace = True) X_ext = pd.concat([train_ext[col_interval],train_ext[col_ordinal], pd.get_dummies(train_ext[col_binary])], axis=1) X_ext.head()",Yes,3,17.0 "X_ext = X_ext.drop(columns = ['facebook_profile_False','gender_f'], axis=1)",No,5,10.0 "test_ext = pd.concat([test_ext[col_interval],test_ext[col_ordinal], pd.get_dummies(test_ext[col_binary])], axis=1) test_ext.fillna(-1, inplace = True) #X_ext = X_ext.drop(columns = ids_targets, axis =1) y_ext = pd.DataFrame(train_ext.target) #train_lc.target_default.ravel(order='K') #pd.DataFrame(train_ext.target) y_ext=y_ext.astype('bool') y_ext = y_ext.values y_ext = y_ext.reshape(-1)",Yes,3,21.0 "cols = list(X_ext) test_ext = test_ext[cols]",No,5,77.0 X_ext.head(),No,5,41.0 "from sklearn.utils.multiclass import type_of_target type_of_target(y_ext)",No,4,70.0 X_ext.shape,No,5,58.0 test_ext.shape,No,5,58.0 "X_train, X_test, y_train, y_test = train_test_split(X_ext, y_ext, test_size=0.2, random_state=42)",No,5,13.0 from sklearn.model_selection import GridSearchCV,No,5,22.0 "tuned_parameters = [{'max_depth': [4,5,6,7,8,9,10], 'max_features': [4,5,6,7,8,9,10], 'n_estimators':[10,25,50,75]}] clf = GridSearchCV(RandomForestClassifier(random_state=29), tuned_parameters, cv=3, scoring='roc_auc') clf.fit(X_train, y_train)",No,5,6.0 "from hyperopt.pyll.base import scope from hyperopt.pyll.stochastic import sample from hyperopt import STATUS_OK, Trials, fmin, hp, tpe",No,5,22.0 "import random import itertools N_HYPEROPT_PROBES = 10 EARLY_STOPPING = 80 HOLDOUT_SEED = 123456 HOLDOUT_SIZE = 0.10 HYPEROPT_ALGO = tpe.suggest # tpe.suggest OR hyperopt.rand.suggest DATASET = 'clean' # 'raw' | 'clean' | 'extended' SEED0 = random.randint(1,1000000000) NB_CV_FOLDS = 5",No,5,59.0 "obj_call_count = 0 cur_best_score = 0",No,5,77.0 "space_RF ={
\'n_estimators\' : hp.choice(\'n_estimators\', np.arange(10, 200, dtype=int)),
\'max_depth\' : hp.choice(""max_depth"", np.arange(3, 15, dtype=int)),
\'min_samples_split\' : hp.choice(""min_samples_split"", np.arange(20, 100, dtype=int)),
\'min_samples_leaf\' : hp.choice(""min_samples_leaf"", np.arange(10, 100, dtype=int)),
\'criterion\' : hp.choice(\'criterion\', [""gini"", ""entropy""]),
\'class_weight\' : hp.choice(\'class_weight\', [\'balanced_subsample\', None]),
\'n_jobs\' : -1,
\'oob_score\' : True,
\'random_state\' : hp.randint(\'random_state\',2000000)
}
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 9, \'min_samples_leaf\': 74, \'min_samples_split\': 12, \'n_estimators\': 134, \'random_state\': 1433254}
#Params: class_weight=balanced_subsample criterion=entropy max_depth=11 min_samples_leaf=2 min_samples_split=29 n_estimators=89 n_jobs=-1 oob_score=True
#Params: class_weight=balanced_subsample criterion=entropy max_depth=10 min_samples_leaf=2 min_samples_split=17 n_estimators=38 n_jobs=-1 oob_score=True",No,5,5.0 "space_XGB ={
\'max_depth\' : hp.choice(""max_depth"", np.arange(5, 15,dtype=int)),
\'learning_rate\' : hp.loguniform(\'learning_rate\', -4.9, -3.0),
\'n_estimators\' : hp.choice(\'n_estimators\', np.arange(10, 100,dtype=int)),
\'objective\' : \'binary:logistic\',
\'booster\' : \'gbtree\',
\'reg_alpha\' : hp.uniform(\'reg_alpha\', 1e-5, 1e-1),
\'reg_lambda\' : hp.uniform(\'reg_lambda\', 1e-5, 1e-1),
\'colsample_bytree\' : hp.uniform(\'colsample_bytree\', 0.5, 0.8),
\'min_child_weight\' : hp.uniform(\'min_child_weight\', 0.5, 0.8),
\'random_state\' : hp.randint(\'random_state\',2000000)
}",No,5,5.0 "train_stack = train_backup.copy() test_stack = test_backup.copy()",No,5,12.0 train_stack.shape,No,5,58.0 test_stack.shape,No,5,58.0 "meta_data = get_meta(train_stack) col_ordinal = meta_data[(meta_data.level == 'ordinal') & (meta_data.keep)& (meta_data.role != 'target')].index col_nominal = meta_data[(meta_data.level == 'nominal') & (meta_data.keep)& (meta_data.role != 'target')].index col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)& (meta_data.role != 'target')].index col_binary = meta_data[(meta_data.level == 'binary') & (meta_data.keep) & (meta_data.role != 'target')].index #meta_data",No,5,14.0 "train_stack = train_stack.replace(-1, np.NaN) d_median = train_stack.median(axis=0) d_mean = train_stack.mean(axis=0) train_stack = train_stack.fillna(-1)",No,5,17.0 from sklearn import preprocessing,No,5,22.0 " train_stack = create_extra_features(train_stack) test_stack = create_extra_features(test_stack) train_stack['bin_sum'] = train_stack[col_binary].sum(axis=1) test_stack['bin_sum'] = test_stack[col_binary].sum(axis=1)",No,5,8.0 "col = [c for c in train_stack.columns if c not in ['id','target']] col = [c for c in col if not c.startswith('ps_calc_')] ## Droping ps_cal_ vars",No,5,77.0 target_stack.shape,No,5,58.0 "score = classifier.evaluate(test_x, test_y, verbose=0) print('valid loss:', score[0]) print('valid accuracy:', score[1])",No,5,49.0 "import pandas as pd aa = pd.read_csv('submission_file.csv') aa ",Yes,4,45.0 "import numpy as np import pandas as pd from fastai.conv_learner import * import os # Input data files are available in the ""../input/"" directory. # Any results you write to the current directory are saved as output. PATH = ""data/dogscats/""",No,5,77.0 "# Image size, batch size and pretrained model architecture sz=224 bs=20 arch=resnet50",No,5,77.0 "X_stack = pd.concat([train_stack[col_interval],train_stack[col_ordinal], pd.get_dummies(train_stack[col_binary])], axis=1) test_stack_val = pd.concat([test_stack[col_interval],test_stack[col_ordinal], pd.get_dummies(test_stack[col_binary])], axis=1) y_stack = target_stack",Yes,4,21.0 X_stack.shape,No,5,58.0 test_stack_val.shape,No,5,58.0 " X_stack = X_stack.drop(columns=['gender_-1','facebook_profile_-1'], axis = 1) ",No,5,10.0 "#RandomForest params
rf_params = {}
rf_params[\'n_estimators\'] = 80
rf_params[\'max_depth\'] = 12
rf_params[\'min_samples_split\'] = 50
rf_params[\'min_samples_leaf\'] = 23
#rf_params[\'class_weight\'] = ""balanced_subsample""# ""balanced"" # ""balanced_subsample""
#rf_params[\'criterion\'] = 1
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 10, \'min_samples_leaf\': 23, \'min_samples_split\': 88, \'n_estimators\': 66, \'random_state\': 584867}
#{\'class_weight\': 0, \'criterion\': 1, \'max_depth\': 3, \'min_samples_leaf\': 1, \'min_samples_split\': 15, \'n_estimators\': 31}
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 7, \'min_samples_leaf\': 1, \'min_samples_split\': 25, \'n_estimators\': 52}",No,5,59.0 "# XGBoost params xgb_params = {} xgb_params['learning_rate'] =0.03660642032718193 xgb_params['n_estimators'] = 70 xgb_params['max_depth'] = 7 xgb_params['reg_alpha'] = 0.1 xgb_params['reg_lambda'] = 0.1 xgb_params['colsample_bytree'] = 0.6162725690461764 xgb_params['min_child_weight'] = 0.751826989118936 #{'colsample_bytree': 0.6162725690461764, 'learning_rate': 0.07660642032718193, 'max_depth': 1, 'min_child_weight': 0.751826989118936, 'n_estimators': 51, 'random_state': 2943, 'reg_alpha': 8.447744027604217e-05, 'reg_lambda': 2.506380824011793e-05} #{'colsample_bytree': 0.6669680642534331, 'learning_rate': 0.0027697150000431693, 'max_depth': 2, 'min_child_weight': 0.7842089630474731, 'n_estimators': 58, 'random_state': 194789, 'reg_alpha': 6.334122926125054e-05, 'reg_lambda': 7.725227814541321e-05} #{'colsample_bytree': 0.7185209051997172, 'learning_rate': 0.09634564047154007, 'max_depth': 1, 'min_child_weight': 0.7765683660381831, 'n_estimators': 60, 'random_state': 1791482, 'reg_alpha': 1.5998181299665275e-05, 'reg_lambda': 9.446368653609355e-05} #{'colsample_bytree': 0.785981949747911, 'learning_rate': 0.07697973917507268, 'max_depth': 0, 'min_child_weight': 0.7528834859046539, 'n_estimators': 48, 'random_state': 1038594, 'reg_alpha': 9.730513129698628e-05, 'reg_lambda': 9.804649087783435e-05}",No,5,59.0 "rf_model = RandomForestClassifier(**rf_params, random_state=584867)",No,5,4.0 "xgb_model = XGBClassifier(**xgb_params, random_state=2943)",No,5,4.0 log_model = LogisticRegression(random_state=29),No,5,4.0 "stack = Ensemble(n_splits=3, stacker = log_model, base_models = (rf_model, xgb_model))",No,5,4.0 "X_stack.fillna(-1, inplace = True) test_stack_val.fillna(-1,inplace=True) y_pred = stack.fit_predict(X_stack, target_stack, test_stack_val)",Yes,4,17.0 "sub = pd.DataFrame() sub['ids'] = test_id sub['prob'] = y_pred sub.to_csv('stacked_main.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting graphs
import sklearn
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
os.listdir(""../input"")
# Any results you write to the current directory are saved as output.",No,5,88.0 category_id_df,No,5,53.0 "# The sorted function Converts dictionary items into a (sorted) list. # In subsequent steps - We will use this list to iterate over the categories sorted(category_to_id.items())",No,5,9.0 "# Store the entries into the results dataframe and name its columns BBC Ncv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])",No,5,12.0 "# Create submission file: 2 colmuns with header id, label submission = pd.DataFrame({'id':os.listdir(f'{PATH}test1'), 'label':label_probs}) submission['id'] = submission['id'].map(lambda x: x.split('.')[0]) submission['id'] = submission['id'].astype(int) submission = submission.sort_values('id') submission.to_csv('../working/submission.csv', index=False)",Yes,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting graphs
import sklearn
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/ai-academy-intermediate-class-competition-1""))

# Any results you write to the current directory are saved as output.",No,4,88.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), MultinomialNB(), LogisticRegression(random_state=0), ] ",No,5,4.0 "# model.coef_ contains the importance of each feature for each category model.coef_",No,5,79.0 test_df,No,5,41.0 test_df.head(),No,5,41.0 test_df.Text.tolist(),No,5,16.0 "#translating text column into a list test_features = tfidf.transform(test_df.Text.tolist()) Y_pred = model.predict(test_features) Y_pred",Yes,3,8.0 submission,No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
data = pd.read_csv(""../input/ai-academy-intermediate-class-competition-1/BBC News Train.csv"")
data = data[[""Text"", ""Category""]]
data
# Any results you write to the current directory are saved as output.",No,4,45.0 "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm=\'l2\', encoding=\'latin-1\', ngram_range=(1, 2), stop_words=\'english\')
X = vectorizer.fit_transform(data[""Text""])
print(len(vectorizer.get_feature_names()))
print(X.shape)",Yes,3,8.0 "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english') features = tfidf.fit_transform(data.Text).toarray() print(features) labels = data.category_id print(labels) features.shape",Yes,3,8.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=500, max_depth=4), MultinomialNB(), LogisticRegression(random_state=4), ] CV = 5 cv_df = pd.DataFrame(index=range(CV * len(models))) print(cv_df) entries = [] for model in models: model_name = model.__class__.__name__ accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV) print(accuracies) for fold_idx, accuracy in enumerate(accuracies): entries.append((model_name, fold_idx, accuracy)) print(entries) cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy']) print(cv_df)",Yes,3,3.0 "model = models[2] model.fit(features, labels) model.coef_",Yes,3,7.0 "test_data = pd.read_csv(""../input/bbc-test-3/BBC News Test.csv"") test_data",No,5,45.0 "submission = pd.DataFrame({ ""ArticleId"": test_data[""ArticleId""], ""Category"": submission }) submission",No,5,55.0 "submission.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import sklearn import matplotlib.pyplot as plt",No,5,22.0 "train = pd.read_csv(""../input/train.csv"")",No,5,45.0 "Xtrain = train Xtrain = Xtrain.drop(columns=[""median_house_value""]) Xtrain.head()",Yes,4,10.0 "from sklearn import linear_model from sklearn.neighbors import KNeighborsRegressor",No,5,22.0 "ridge = linear_model.Ridge(alpha = 0.5) ridge.fit(Xtrain,Ytrain) ridge.coef_",Yes,3,7.0 "lasso = linear_model.Lasso(alpha = 0.1) lasso.fit(Xtrain,Ytrain)",Yes,4,7.0 "KNNRegression = KNeighborsRegressor(n_neighbors=52) KNNRegression.fit(Xtrain, Ytrain) ",Yes,4,7.0 "test = pd.read_csv(""../input/test.csv"") test.head()",Yes,4,45.0 "pred.to_csv(""prediction.csv"", index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

FOLDER = ""../input/""
# Any results you write to the current directory are saved as output.",Yes,3,45.0 "import matplotlib.pyplot as plt import matplotlib.cm as cm from sklearn import linear_model, ensemble, neighbors import numpy as np import sympy as sp import pandas as pd from pylab import rcParams from sklearn.metrics import r2_score from sklearn.model_selection import cross_val_score from sklearn.feature_selection import f_regression plt.rcParams['figure.figsize'] = [20, 10] %matplotlib inline",Yes,4,22.0 dataset = pd.read_csv(FOLDER+'train.csv'),No,5,45.0 dataset.head(),No,5,41.0 dataset.mean(),No,5,40.0 dataset.std()/dataset.mean(),No,5,40.0 "def add_features(dset): mean_houses = pd.Series(dset['households']/dset['population'], name = 'mean_households') rooms_ratio = pd.Series(dset['total_rooms']/dset['total_bedrooms'], name = 'ratio' ) return pd.concat([mean_houses, rooms_ratio, dset], axis = 1)",No,5,8.0 "test = pd.read_csv(FOLDER+'test.csv') test.head()",Yes,4,45.0 "import numpy as np import pandas as pd import sklearn from matplotlib import pyplot as plt from scipy import stats as st import os import matplotlib.colors as mcolors",No,5,22.0 "from sklearn.pipeline import make_pipeline from sklearn.ensemble import BaggingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.neural_network import MLPRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import AdaBoostRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.model_selection import cross_val_score",No,5,22.0 "trainfilepath = ""../input/atividade-3-pmr3508/train.csv"" testfilepath = ""../input/atividade-3-pmr3508/test.csv""",No,5,77.0 "trainHouses = pd.read_csv(trainfilepath, sep=r'\\s*,\\s*', engine='python', na_values='?')
testHouses = pd.read_csv(testfilepath, sep=r'\\s*,\\s*', engine='python', na_values='?')",No,5,45.0 "states = pd.read_csv(""../input/averagestatecoordinates/states.csv"", sep=r\'\\s*,\\s*\', engine=\'python\', na_values=\'?\')
states.head()",Yes,4,45.0 "from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3, p=2)

#knn.fit(states.drop(""state"", axis=""columns""), states[\'state\'])
knn.fit(states[[""latitude"", ""longitude""]], states[\'state\'])


#trainHouses[""state""] = knn.predict(trainHouses.drop(trainHouses.columns.drop([""latitude"", ""longitude""]), axis=""columns""))
#prediction = knn.predict(trainHouses.drop(trainHouses.columns.drop([""latitude"", ""longitude""]), axis=""columns""))
prediction = knn.predict(trainHouses[[""latitude"", ""longitude""]])

#print(prediction.value_counts())
print(np.unique(prediction,return_counts=True))",Yes,3,7.0 "apple = (37.33182, -122.03118) SF = (37.783333, -122.416667) #trainHouses[""distance_to_apple""] = np.sqrt((trainHouses[""latitude""] - miami[0])**2 + (trainHouses[""longitude""] - miami[1])**2) #d_to_apple = np.sqrt((trainHouses[""latitude""] - apple[0])**2 + (trainHouses[""longitude""] - apple[1])**2) d_to_apple = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], apple[0], apple[1]) #d_to_SF = np.sqrt((trainHouses[""latitude""] - SF[0])**2 + (trainHouses[""longitude""] - SF[1])**2) d_to_SF = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], SF[0], SF[1]) plt.hist(d_to_apple, bins=100) plt.show() plt.hist2d(d_to_apple, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show() plt.hist(d_to_SF, bins=100) plt.show() plt.hist2d(d_to_SF, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show()",Yes,4,33.0 "LA = (35.0569, -118.25) Beverly_Hills = (34.073056, -118.399444) #d_to_LA = np.sqrt((trainHouses[""latitude""] - LA[0])**2 + (trainHouses[""longitude""] - LA[1])**2) d_to_LA = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], LA[0], LA[1]) #d_to_BH = np.sqrt((trainHouses[""latitude""] - Beverly_Hills[0])**2 + (trainHouses[""longitude""] - Beverly_Hills[1])**2) d_to_BH = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], Beverly_Hills[0], Beverly_Hills[1]) plt.hist(d_to_LA, bins=100) plt.show() plt.hist2d(d_to_LA, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show() plt.hist(d_to_BH, bins=100) plt.show() plt.hist2d(d_to_BH, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show()",Yes,4,33.0 "trainHouses[""distance_to_SF""] = d_to_SF #testHouses[""distance_to_SF""] = np.sqrt((testHouses[""latitude""] - SF[0])**2 + (testHouses[""longitude""] - SF[1])**2) testHouses[""distance_to_SF""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], SF[0], SF[1]) trainHouses[""distance_to_LA""] = d_to_LA #testHouses[""distance_to_LA""] = np.sqrt((testHouses[""latitude""] - LA[0])**2 + (testHouses[""longitude""] - LA[1])**2) testHouses[""distance_to_LA""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], LA[0], LA[1]) trainHouses[""distance_to_state_center""] = d_to_center #testHouses[""distance_to_state_center""] = np.sqrt((testHouses[""latitude""] - lat)**2 + (testHouses[""longitude""] - long)**2) testHouses[""distance_to_state_center""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], lat, long) trainHouses[""distance_to_beverly_hills""] = d_to_BH testHouses[""distance_to_beverly_hills""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], Beverly_Hills[0], Beverly_Hills[1])",No,5,8.0 trainHouses.shape,No,5,58.0 trainHouses.head(),No,5,41.0 "plt.hist(trainHouses[""median_house_value""], bins=100) plt.show()",No,5,33.0 "XtrainHouses = trainHouses.drop([""Id"", ""median_house_value""], axis=""columns"") XtrainHousesB = trainHouses.drop([""Id"", ""median_house_value"", ""median_age"", ""total_rooms"", ""total_bedrooms"", ""population"", ""households""], axis=""columns"") YtrainHouses = trainHouses[""median_house_value""] XtestHouses = testHouses.drop(""Id"", axis=""columns"") XtestHousesB = testHouses.drop([""Id"", ""median_age"", ""total_rooms"", ""total_bedrooms"", ""population"", ""households""], 
axis=""columns"")",Yes,4,10.0 "#a = trainHouses[trainHouses[""median_house_value""].transform(lambda x: x<=500000)] #a = trainHouses[trainHouses[""median_house_value""] <=500000] plt.hist(trainHouses[trainHouses[""median_house_value""] <=500000][""median_house_value""], bins=100) plt.show() culledTrainHouses = trainHouses[trainHouses[""median_house_value""] <=500000] XculledTrainHouses = culledTrainHouses.drop([""Id"", ""median_house_value""], axis=""columns"") YculledTrainHouses = culledTrainHouses.median_house_value ",Yes,3,33.0 "import pandas as pd import sklearn import os import numpy as np import matplotlib.pyplot as plt from sklearn.neighbors import KNeighborsRegressor from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_log_error from sklearn import linear_model from sklearn.ensemble import RandomForestRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor ",Yes,4,22.0 "train_data = pd.read_csv('../input/train.csv', engine='python') test_data = pd.read_csv('../input/test.csv', engine='python')",No,5,45.0 "train_data.head() ",No,5,41.0 train_data.shape,No,5,58.0 train_data.info(),No,5,40.0 "Xtrain = train_data Xtrain = Xtrain.drop('Id', axis=1) Xtrain = Xtrain.drop('median_house_value', axis=1) Ytrain = train_data['median_house_value']",Yes,4,21.0 "knn = KNeighborsRegressor(n_neighbors=10) knn.fit(Xtrain, Ytrain) scores = cross_val_score(knn, Xtrain, Ytrain, cv=10) Ypred = knn.predict(Xtrain) scores.mean()",Yes,3,7.0 "rmsle(Ytrain, Ypred)",No,5,28.0 "reg = linear_model.Lasso(alpha = 0.1) reg.fit(Xtrain, Ytrain) scores = cross_val_score(reg, Xtrain, Ytrain, cv=10) Ypred=reg.predict(Xtrain) scores.mean()",Yes,3,7.0 "rid = linear_model.Ridge(alpha = 0.5) rid.fit(Xtrain, Ytrain) scores = cross_val_score(rid, Xtrain, Ytrain, cv=10) Ypred = rid.predict(Xtrain) scores.mean()",Yes,3,7.0 "bay = linear_model.BayesianRidge() bay.fit(Xtrain, Ytrain) scores = cross_val_score(bay, Xtrain, Ytrain, cv=10) Ypred = bay.predict(Xtrain) scores.mean()",Yes,3,7.0 "lars = linear_model.LassoLars(alpha = 0.1) lars.fit(Xtrain, Ytrain) scores = cross_val_score(lars, Xtrain, Ytrain, cv=10) Ypred = lars.predict(Xtrain) scores.mean()",Yes,3,7.0 "california_sea=[(41.990352, -124.216535),(41.936725, -124.199048),(41.862157, -124.220161),(41.758672, -124.240793),(41.730317, -124.162807),(41.672629, -124.139878),(41.722746, -124.151351),(41.671813, -124.136762),(41.618963, -124.109252),(41.470737, -124.072740),(41.383226, -124.066948),(41.308172, -124.094492),(41.212278, -124.121904), (41.137176, -124.165918),(41.062165, -124.165618),(41.020596, -124.115740),(40.928851, -124.143028),(40.858028, -124.126245),(40.812048, -124.181163),(40.728511, -124.235831),(40.649059, -124.301387),(40.586325, -124.344954),(40.511043, -124.388365),(40.440002, -124.409806),(40.395399, -124.383960),(40.322914, -124.349643),(40.241803, -124.337706),(40.186635, -124.253402),(40.122885, -124.169203),(40.067673, -124.068499),(40.008009, -124.029231), (39.922813, -123.945453),(39.837566, -123.873007),(39.735216, -123.828474),(39.654186, -123.789622),(39.564619, -123.761930),(39.399528, -123.821626),(39.201588, -123.770073),(39.076989, -123.691566),(38.960637, -123.724138),(38.879044, -123.662811),(38.754580, -123.507611),(38.634199, -123.386034),(38.496411, -123.193367),(38.336876, -123.061865),(38.259117, -122.974368), (38.151338, -122.952917),(38.060918, -122.980669),(37.996318, -123.002792), (38.026254, -122.926130),(38.004306, 
-122.827828),(37.931906, -122.744687),(37.902923, -122.652017),(37.872444, -122.594173),(37.880984, -122.392446), (37.815555, -122.367515),(37.628327, -122.331577),(37.542968, -122.455670), (37.370235, -122.414093),(37.290236, -122.415691),(37.167091, -122.356855), (37.088046, -122.276348),(36.987005, -122.157357),(36.951905, -122.049790), (36.969554, -121.914753),(36.925477, -121.862435),(36.824092, -121.802024), (36.620740, -121.851334),(36.480625, -121.934216),(36.282719, -121.866908), (36.162592, -121.678018),(35.990860, -121.498031),(35.827849, -121.382193), (35.671399, -121.272296),(35.608589, -121.143265),(35.453082, -120.919491), (35.297750, -120.877400),(35.189759, -120.819107),(35.180890, -120.736397), (35.097645, -120.628863),(34.932680, -120.660285),(34.842040, -120.610177), (34.742216, -120.618143),(34.583391, -120.639685),(34.528043, -120.518413), (34.457687, -120.472919),(34.458791, -120.347644),(34.469789, -120.138306), (34.422313, -119.903627),(34.399196, -119.699791),(34.408922, -119.552255),(34.335795, -119.408499),(34.288024, -119.329889),(34.199208, -119.247261),(34.115993, -119.153777),(34.041474, -118.899965),(34.035682, -118.855901),(34.018486, -118.822894),(34.003602, -118.805037),(34.016106, -118.785710), (34.029683, -118.744327),(34.037409, -118.667109),(34.036912, -118.580005), (34.009365, -118.502919),(33.984242, -118.472597),(33.960222, -118.454035),(33.867022, -118.402873),(33.810913, -118.390523),(33.770287, -118.420867),(33.716625, -118.060214),(33.606537, -117.889392),(33.385674, -117.578771),(33.270497, -117.443285),(33.127431, -117.326314),(33.053581, -117.291643),(32.831417, -117.277875),(32.683026, -117.189643),(32.536805, -117.122224)]",No,5,77.0 "train_data3 = train_data2
train_data3[""rooms_per_household""] = train_data3[""total_rooms""]/train_data3[""households""]
train_data3[""bedrooms_per_room""] = train_data3[""total_bedrooms""]/train_data3[""total_rooms""]
train_data3[""population_per_household""] = train_data3[""population""]/train_data3[""households""]
train_data3[""income_per_person""] = train_data3[""median_income""]/train_data3[""population_per_household""]
train_data3[\'mean_rooms\'] = train_data3[\'total_rooms\']/train_data3[\'households\']
train_data3[\'rooms_per_person\'] = train_data3[\'total_rooms\']/train_data3[\'population\']
train_data3[\'mean_bedrooms\'] = train_data3[\'total_bedrooms\']/train_data3[\'households\']
train_data3[\'bedrooms_per_person\'] = train_data3[\'total_bedrooms\']/train_data3[\'population\']
train_data3[\'persons_per_household\'] = train_data3[\'population\']/train_data3[\'households\']
train_data3[\'total_income\'] = train_data3[\'median_income\']*train_data3[\'households\']",No,5,8.0 "d = 8 n = 100 boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=d), n_estimators=n) boost.fit(Xtrain3, Ytrain) Ypred = boost.predict(Xtrain3) boost_rmsle = rmsle(Ytrain, Ypred) print('error =', boost_rmsle)",Yes,3,7.0 "from sklearn.model_selection import train_test_split X_train, X_validation, y_train, y_validation = train_test_split(Xtrain3, Ytrain, train_size=0.7) from catboost import CatBoostRegressor model=CatBoostRegressor(iterations=200, depth=6, learning_rate=0.2, loss_function='RMSE') model.fit(Xtrain3, Ytrain,eval_set=(X_validation, y_validation),plot=True) Ypred = model.predict(Xtrain3) model_rmsle = rmsle(Ytrain, Ypred) print('error =', model_rmsle)",Yes,2,7.0 "Xtest2 = test_data2.drop('Id', axis=1)",No,5,10.0 "forest = RandomForestRegressor(max_depth=21, random_state=0, n_estimators=1000) forest.fit(Xtrain3, Ytrain) Ypred = forest.predict(Xtrain3) forest_rmsle = rmsle(Ytrain, Ypred) print('log error =', forest_rmsle) prediction = forest.predict(Xtest2)",Yes,3,7.0 "import numpy as np import pandas as pd import sklearn import matplotlib.pyplot as plt import os print(os.listdir(""../input"")) ",No,5,88.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") Xtrain = train.drop(columns=[""Id"",""median_house_value""]) Ytrain = train[""median_house_value""] ",Yes,3,45.0 Xtrain.describe(),No,5,40.0 YPredict.describe(),No,5,40.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":YPredict}).to_csv(""pred_R.csv"",index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"") data.head()",Yes,4,45.0 data.shape,No,5,58.0 "newData = data.drop([""Id"",""latitude"",""longitude""],axis=1)",No,5,10.0 "newData[""avg_rooms""] = newData.total_rooms/newData.households newData[""avg_bedrooms""] = newData.total_bedrooms/newData.households newData[""avg_inhabitants""] = newData.population/newData.households",No,5,8.0 "from sklearn.neighbors import KNeighborsRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "knn = KNeighborsRegressor() knn.get_params()",Yes,4,4.0 "param_grid = {""n_neighbors"":[i for i in range(1,31)],""weights"":[""uniform"",""distance""],""p"":[1,2]} grid = GridSearchCV(knn,param_grid,cv=10)",Yes,4,5.0 "grid.fit(xTrain,yTrain) print(grid.best_estimator_) print(grid.best_score_)",No,5,6.0 "ridge = Ridge() ridge.get_params()",Yes,4,4.0 "param_grid2 = {""alpha"":np.linspace(0.5,10.5,101).tolist()} grid2 = GridSearchCV(ridge,param_grid2,cv=10)",Yes,4,5.0 "grid2.fit(xTrain,yTrain) print(grid2.best_estimator_) print(grid2.best_score_)",No,5,6.0 from sklearn.linear_model import Lasso,No,5,22.0 "lasso = Lasso() lasso.get_params()",Yes,4,4.0 "param_grid3 = {""alpha"":np.linspace(0.5,5.5,51).tolist(),""normalize"":[True,False]} grid3 = GridSearchCV(lasso,param_grid3,cv=10)",Yes,4,5.0 "grid3.fit(xTrain,yTrain) print(grid3.best_estimator_) print(grid3.best_score_)",No,5,6.0 "testRaw = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"") ID_list = testRaw.Id.tolist() testRaw[""avg_rooms""] = testRaw.total_rooms/testRaw.households testRaw[""avg_bedrooms""] = testRaw.total_bedrooms/testRaw.households testRaw[""avg_inhabitants""] = testRaw.population/testRaw.households testData = testRaw.drop([""Id"",""latitude"",""longitude""],axis=1) 
testData.head()",Yes,3,8.0 "knn.fit(xTrain,yTrain) pred_knn = knn.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_knn}).to_csv(""pred_knn.csv"",index=False)",No,5,25.0 "ridge.fit(xTrain,yTrain) pred_ridge = ridge.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_ridge}).to_csv(""pred_ridge.csv"",index=False)",No,5,25.0 "lasso.fit(xTrain,yTrain) pred_lasso = lasso.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_lasso}).to_csv(""pred_lasso.csv"",index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from geopy.distance import distance from geopy.distance import vincenty from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso from sklearn.ensemble import RandomForestClassifier",No,5,22.0 "Train = pd.read_csv(""../input/californianhouses/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values="""")
Test = pd.read_csv(""../input/californianhouses/test.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values="""")",No,5,45.0 Train.head(),No,5,41.0 # Id no um critrio para estimar a varivel desejada
Train = Train.drop(columns=['Id']),No,5,10.0 plt.title('Correlation matrix')
sns.heatmap(mcorr),No,5,80.0 "scaler = MinMaxScaler() selected_columns = ['median_income', 'total_rooms','population','median_age'] SC = scaler.fit_transform(Train[selected_columns]) x_train, x_test, y_train, y_test = train_test_split(SC, Train['median_house_value'], test_size=0.20)",Yes,4,18.0 "def rmsle(y_test, y_pred): return np.sqrt(np.mean((np.log(y_pred+1) - np.log(y_test+1))**2)) reg = LinearRegression() scorer = make_scorer(rmsle, greater_is_better=False) reg.fit(x_train, y_train) y_pred = reg.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "param_grid = dict(n_neighbors=list(range(1,15))) neigh = KNeighborsClassifier() grid_obj = GridSearchCV(neigh, param_grid, scoring=scorer, cv=5) grid_obj.fit(x_train, y_train) grid_obj.best_params_",Yes,3,7.0 "neigh = KNeighborsClassifier(n_neighbors=1) neigh.fit(x_train, y_train) y_pred = neigh.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "las = Lasso() param_grid = dict(alpha=np.divide(list(range(1,100)),100)) grid_obj = GridSearchCV(las, param_grid, scoring=scorer, cv=5) grid_obj.fit(x_train, y_train) grid_obj.best_params_",Yes,2,7.0 "las = Lasso(alpha=0.21) las.fit(x_train, y_train) y_pred = las.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "rfc = RandomForestClassifier(n_estimators=50, max_depth=35, random_state=0) rfc.fit(x_train, y_train) y_pred = rfc.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test))) ",Yes,3,7.0 "dfTest = Test.drop(['longitude', 'latitude', 'households', 'total_bedrooms'], axis=1) dfTest.head()",Yes,4,10.0 dfTest.shape,No,5,58.0 "selected_model = rfc x_val_test = scaler.transform(dfTest[selected_columns]) y_val_test = selected_model.predict(x_val_test) dfSave = pd.DataFrame(data={""Id"" : dfTest[""Id""], ""median_house_value"" : y_val_test}) pd.DataFrame(dfSave[[""Id"", ""median_house_value""]], columns = [""Id"", ""median_house_value""]).to_csv(""Output.csv"", index=False) ",Yes,3,25.0 # Importao das bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import math
import seaborn as sns
,No,5,23.0 "#Lendo a base de treino
traindata = pd.read_csv(""../input/california-houses/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "#Modificando a visalizao da base de treino
traindata.iloc[0:20,:]",No,5,14.0 traindata = traindata.drop(columns = ['Id']),No,5,10.0 traindata.info(),No,5,40.0 "# Grficos de barra das variveis
traindata.hist(bins=50, figsize=(20,20))
plt.show()",No,5,33.0 "plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",No,5,80.0 "knn_trainX = traindata[[""longitude"",""total_rooms"",""total_bedrooms"",""population"",""households"",""median_income""]] ",No,5,12.0 "from sklearn.neighbors import KNeighborsRegressor neighbor = KNeighborsRegressor(n_neighbors=2) neighbor.fit(knn_trainX,trainY) knn_predict = neighbor.predict(knn_trainX) df_knn = pd.DataFrame({'Y_real':trainY[:],'Y_pred':knn_predict[:]}) print(rmsle(df_knn.Y_real,df_knn.Y_pred))",Yes,2,7.0 "def rooms_pop(row):
row['rooms_pop'] = row['total_rooms'] / row['population']
return row
traindata = traindata.apply(rooms_pop, axis=1)
traindata = traindata.drop(['population'], axis=1)
plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",Yes,3,80.0 "def age_rooms(row):
row['age_rooms'] = row['median_age'] / row['total_rooms']
return row
traindata = traindata.apply(age_rooms, axis=1)
traindata = traindata.drop(['median_age'], axis=1)
plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",Yes,3,80.0 "traindata = traindata.drop(columns = ['latitude','longitude'])",No,5,10.0 "newknn_trainX = traindata[[""per_capita"",""total_bedrooms"",""rooms_pop"",""households"",""age_rooms""]]
neighbor = KNeighborsRegressor(n_neighbors=2)
neighbor.fit(newknn_trainX,trainY)
knn_predict = neighbor.predict(newknn_trainX)
df_knn = pd.DataFrame({\'Y_real\':trainY[:],\'Y_pred\':knn_predict[:]})
print(rmsle(df_knn.Y_real,df_knn.Y_pred))",Yes,2,7.0 "x_val_test = testX
y_val_test = neighbor.predict(x_val_test)

dfSave = pd.DataFrame(data={""Id"" : testdata[""Id""], ""median_house_value"" : y_val_test})
dfSave[\'Id\'] = dfSave[\'Id\'].astype(int)
pd.DataFrame(dfSave[[""Id"", ""median_house_value""]], columns = [""Id"", ""median_house_value""]).to_csv(""Output.csv"", index=False)
dfSave.head()",Yes,3,25.0 "%matplotlib inline

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

import os

df = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")
df = df.replace(np.nan,\' \', regex=True)",No,3,45.0 df.describe(),No,5,40.0 "X[""coord""] = X[""latitude""].map(str) + ', ' + X[""longitude""].map(str)",No,5,8.0 "cities = pd.DataFrame(address.Id)
cities = cities.set_index(\'Id\')
cities[\'city\'] = \'\'

for index, row in address.iterrows():
city = row[\'location\'][-2]
row_id = row[\'Id\']
cities.at[row_id, \'city\'] = city.lstrip(\' \')

cities.to_csv(""cities.csv"", index=False)",Yes,4,25.0 "cities = pd.read_csv(""../input/california-prices/cities.csv"",
sep=r\'\\s*""\\s*\',
engine=\'python\')
X[\'cities\'] = cities[\'city\']",Yes,4,45.0 "calPrices = pd.read_csv(""../input/california-prices/calif.csv"",
sep=r\'\\s*,\\s*\\s*""\',
engine=\'python\')


calPrices = calPrices.replace(\'---\',\' \', regex=True)
calPrices.columns = [col.replace(\'""\', \'\') for col in calPrices.columns]
calPrices = calPrices.filter([\'Region Name\', \'Current\'])
calPrices = calPrices.replace(\'""\',\'\', regex=True)
calPrices = calPrices.replace(\'\\$\',\'\', regex=True)
calPrices = calPrices.replace(\',\',\'\', regex=True)

calPrices = calPrices.drop(calPrices.index[0])",Yes,4,78.0 calPrices.head(),No,5,41.0 "X['people_pb'] = X.population/X.total_bedrooms X['people_ph'] = X.population/X.households X['income_pr'] = X.median_income/X.total_rooms",No,5,8.0 "X = X.replace('', np.NaN) X = X.replace(' ', np.NaN) X = X.dropna() X = X.drop(['Id','longitude','latitude','coord'], axis = 1)",Yes,4,17.0 "X_train = X.filter(['median_age', 'total_rooms','total_bedrooms', 'population', 'households', 'median_income', 'city_price', 'people_pb','people_ph', 'income_pr'], axis = 1) X_train.describe()",Yes,4,14.0 "from sklearn import preprocessing scaler = preprocessing.StandardScaler().fit(X_train) X_train_scaled = scaler.transform(X_train)",No,5,18.0 "from sklearn import linear_model from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error reg = linear_model.LinearRegression().fit(X_train_scaled, Y) reg.score(X_train_scaled, Y)",Yes,4,7.0 "from sklearn import neighbors knn = neighbors.KNeighborsRegressor(n_neighbors=6) knn.fit(X_train_scaled, Y) knn_scores = cross_val_score(knn, X_train_scaled, Y, cv=10) np.mean(knn_scores)",Yes,3,7.0 "from sklearn import ensemble params = {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} gbr = ensemble.GradientBoostingRegressor(**params) gbr.fit(X_train_scaled, Y)",Yes,4,7.0 "Yreg = reg.predict(X_train_scaled) Yknn = knn.predict(X_train_scaled) Ygbr = gbr.predict(X_train_scaled)",No,5,27.0 "from sklearn.metrics import mean_squared_log_error mean_squared_log_error(Y, Yknn) ",No,5,28.0 "mean_squared_log_error(Y, Ygbr) ",No,5,28.0 "testdf = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")
testdf = testdf.replace(np.nan,\' \', regex=True)",Yes,4,45.0 "testdf[""coord""] = testdf[""latitude""].map(str) + \', \' + testdf[""longitude""].map(str)",No,5,8.0 "texts = [""Hooli stock price soared after a dip in PiedPiper revenue growth."",
""Captain Tsubasa scores a magnificent goal for the Japanese team."",
""Merryweather mercenaries are sent on another mission, as government oversight groups call for new sanctions."",
""Beyonc releases a new album, tops the charts in all of south-east Asia!"",
""You won\'t guess what the latest trend in data analysis is!""]
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)
for text, predicted in zip(texts, predictions):
print(\'""{}""\'.format(text))
print("" - Predicted as: \'{}\'"".format(id_to_category[predicted]))
print("""")",Yes,4,8.0 "test_loc = pd.read_csv(""../input/california-prices/test_loc.csv"",
sep=r\'\\s*""\',
engine=\'python\')

test_loc.columns = [col.replace(\'""\', \'\') for col in test_loc.columns]
test_loc = test_loc.replace(\'""\',\'\', regex=True)

test_loc[\'location\'] = test_loc[\'location\'].str.split("","")

test_loc = test_loc.replace(\',\',\'\', regex=True)",Yes,4,78.0 "#import os print(os.listdir(""../input/bbc-test""))",No,5,88.0 "TEST_PATH = os.path.join(""../input/bbc-test"", ""BBC News Test (1).csv"") #Load the data using pandas : Create a DataFrame test_df = pd.read_csv(TEST_PATH) ",No,5,45.0 "test_cities = pd.DataFrame(test_loc.Id)
test_cities = test_cities.set_index(\'Id\')
test_cities[\'city\'] = \'\'

for index, row in test_loc.iterrows():
row_id = row[\'Id\']
city = row[\'location\'][-2]
if city != \'n\':
test_cities.at[row_id, \'city\'] = city.lstrip(\' \')

test_cities.to_csv(""test_cities.csv"", index=False)",Yes,4,25.0 "test_cities = pd.read_csv(""../input/california-prices/test_cities.csv"",
engine=\'python\')

testdf[\'cities\'] = test_cities[\'city\']
",Yes,4,45.0 "import pandas as pd import numpy",No,5,22.0 "data = pd.read_csv(""../input/datasetss/train.csv"")",No,5,45.0 "X_test = testdf.copy() X_test['people_pb'] = X_test.population/X_test.total_bedrooms X_test['people_ph'] = X_test.population/X_test.households X_test['income_pr'] = X_test.median_income/X_test.total_rooms X_test = X_test.filter(['median_age','total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'city_price', 'people_pb','people_ph', 'income_pr'], axis = 1) X_test.describe()",Yes,3,8.0 data,No,5,41.0 X_test_scaled = scaler.transform(X_test),No,5,18.0 Ypred = knn.predict(X_test_scaled),No,5,48.0 "prediction.to_csv(""prediction.csv"", index=False)",No,5,25.0 "data = data.drop('Id', axis=1)",No,5,10.0 "import pandas as pd import matplotlib.pyplot as plt import numpy as np import sklearn as skl from sklearn.neighbors import KNeighborsRegressor as KNR from sklearn.tree import DecisionTreeRegressor as DTR from sklearn.linear_model import Lasso, LassoCV from sklearn.model_selection import cross_val_score as cvs",No,5,22.0 data.isna().sum(),No,5,39.0 "arquivo1 = '../input/test.csv' tester = pd.read_csv(arquivo1, engine = 'python') tester.shape",Yes,4,45.0 "X_train = data.drop('median_house_value', axis=1) y_train = data.median_house_value",No,5,21.0 "from sklearn import linear_model from sklearn.model_selection import cross_val_score",No,5,22.0 "arquivo2 = '../input/train.csv' trainer = pd.read_csv(arquivo2, engine = 'python') trainer.shape",Yes,4,45.0 trainer.head(),No,5,41.0 "best_knr = KNR(n_neighbors=melhor_knr(train,5,50)[1])",No,4,4.0 "best_knr.fit(train,trainer['median_house_value'])",No,5,7.0 knr_pred = best_knr.predict(test),No,5,48.0 "Submit1 = pd.DataFrame() Submit1.insert(0, 'Id', tester['Id']) Submit1.insert(1,'median_house_value', knr_pred)",Yes,4,12.0 best_tree = DTR(max_depth=10),No,5,4.0 "best_tree.fit(train,target)",No,5,7.0 "tree_pred = best_tree.predict(test) tree_pred",No,5,48.0 "lcv = LassoCV().fit(train, target) lcv.score(train, target)",Yes,4,7.0 "lasso = Lasso(max_iter = 100000, selection = 'random') lasso.fit(train, target) pred_lasso = lasso.predict(test) pred_lasso",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.datasets import load_svmlight_file
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/movie-ratings""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "training_labels[training_labels == 0] = -1 testing_labels[testing_labels == 0] = -1",No,5,8.0 "from sklearn.model_selection import cross_val_score, GridSearchCV clf = GridSearchCV(AveragePerceptron(), param_grid={'learning_rate':[1, 0.1, 0.01], 'margin':[ 0, 0.1], 'decay': [False, True], 'epochs':[50], 'avg_decay':[False, True]}, cv=10, scoring='accuracy', n_jobs=-1) clf.fit(testing_data, testing_labels)",No,5,6.0 "clf.best_estimator_.score(testing_data, testing_labels)",No,5,49.0 "with open('submission.csv', 'w') as submission:
with open('../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id', 'r') as example_ids:
submission.write('example_id,label\
')
for example_id, label in zip(example_ids, submission_pred):
submission.write('{},{}\
'.format(example_id.strip(), int(label)))",No,4,25.0 "%matplotlib inline import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from fastai.structured import * import matplotlib.pyplot as plt from sklearn.metrics import mean_squared_error from math import sqrt from sklearn.ensemble import RandomForestRegressor from IPython.core.debugger import set_trace from sklearn.model_selection import KFold import os print(os.listdir(""../input"")) PATH = ""../input/""",No,4,88.0 "df_train = pd.read_csv(f'{PATH}train.csv', parse_dates=['Open Date']) df_test = pd.read_csv(f'{PATH}test.csv', parse_dates=['Open Date']) df_joined = pd.concat([df_train.drop('revenue', axis=1), df_test], axis=0)",Yes,4,45.0 "X_train, X_test = prcs(df_joined.copy()) y_train = df_train['revenue'].copy().apply(np.log)",No,5,21.0 "m = RandomForestRegressor(n_jobs=-1, n_estimators=150, oob_score=True, max_features=0.5) m.fit(X_train, y_train) score(m,X_train, y_train)",Yes,4,7.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.exp(predict(m, X_test))) df_preds.to_csv('submission0.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.exp(predict(models, X_test))) df_preds.to_csv('submission1.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission2.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission3.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies', 'city'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission4.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies', 'city', 'city_group'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission5.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "t = pd.read_csv('../input/training/training.csv') ts = pd.read_csv('../input/test/test.csv')",No,5,45.0 t.shape[0],No,5,58.0 "Y = np.array(t.drop('Image', axis=1).fillna(method='ffill'),dtype=float)",No,5,21.0 "#import packages from keras.models import Sequential from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPool2D,Flatten, LeakyReLU #from keras.layers import LeakyReLU(alpha=0.3) as activation",No,5,22.0 "# Set the CNN model
# my CNN architecture is In -> Conv2D -> LeakyReLU -> MaxPool2D -> Dropout -> Flatten -> [Dense -> LeakyReLU -> Dropout]*2 -> Dense(30) -> Out

model = Sequential()

model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = \'Same\',
activation =\'linear\', input_shape = (96,96,1)))
model.add(LeakyReLU(alpha=.001))
model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(256, activation = ""linear""))
model.add(LeakyReLU(alpha=.001))
model.add(Dropout(0.5))
model.add(Dense(128, activation = ""linear""))
model.add(LeakyReLU(alpha=.001))
model.add(Dropout(0.5))
model.add(Dense(30))",No,5,84.0 "model.fit(X, Y, epochs=100, batch_size=128,validation_split = 0.2)",No,5,7.0 Y_ts = model.predict(X_ts),No,5,48.0 "look_id = pd.read_csv('../input/IdLookupTable.csv') look_id.drop('Location',axis=1,inplace=True)",Yes,4,45.0 look_id['location_id'] = look_id.FeatureName.map(maps),No,5,20.0 "look_id[['RowId','Location']].to_csv('Sub1.csv',index=False)",No,5,25.0 "%matplotlib inline import os import pandas as pd, numpy as np import matplotlib.pyplot as plt from skimage.io import imread import seaborn as sns # nice visuals from sklearn.model_selection import train_test_split # splitting data # quantifying models from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix data_dir = '../input/'",No,5,77.0 "trn_image, vld_image, trn_label , vld_label = train_test_split(full_train_df['image'], full_train_df['opacity'], test_size=0.25, random_state=2018) trn_image = np.stack(trn_image, 0) vld_image = np.stack(vld_image, 0)",Yes,3,13.0 "out_model = models.Sequential() out_model.add(layers.Reshape((64, 64, 1), input_shape=trn_image.shape[1:])) out_model.add(layers.Conv2D(16, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(32, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(64, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(128, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.GlobalAveragePooling2D()) out_model.add(layers.Dense(32, activation='relu')) out_model.add(layers.Dense(1, activation='sigmoid')) out_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy']) out_model.summary()",Yes,4,4.0 "from IPython.display import clear_output fit_results = out_model.fit(trn_image, trn_label, validation_data=(vld_image, vld_label), epochs=100) clear_output()",No,5,7.0 "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10)) ax1.plot(fit_results.history['loss'], label='Training') ax1.plot(fit_results.history['val_loss'], label='Validation') ax1.legend() ax1.set_title('Loss History') ax2.plot(100*np.array(fit_results.history['binary_accuracy']), label='Training') ax2.plot(100*np.array(fit_results.history['val_binary_accuracy']), label='Validation') ax2.legend() ax2.set_title('Accuracy History')",No,5,35.0 "# Import the necessary libraries import numpy as np import pandas as pd import os import time import warnings import gc from six.moves import urllib import matplotlib import matplotlib.pyplot as plt warnings.filterwarnings('ignore')",No,5,23.0 "#Add All the Models Libraries # Scalers from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle from sklearn.pipeline import Pipeline from sklearn.pipeline import FeatureUnion from sklearn.preprocessing import LabelEncoder # Models from sklearn.linear_model import LogisticRegression #logistic regression from sklearn.svm import SVC # Support Vector Classifier from sklearn.ensemble import RandomForestClassifier #Random Forest from sklearn.neighbors import KNeighborsClassifier #KNN from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import VotingClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier #Decision Tree from sklearn.ensemble import BaggingClassifier from sklearn.ensemble import 
GradientBoostingClassifier from sklearn.model_selection import train_test_split #training and testing data split from sklearn import metrics #accuracy measure from sklearn.metrics import confusion_matrix #for confusion matrix from scipy.stats import reciprocal, uniform from sklearn.ensemble import AdaBoostClassifier # Cross-validation from sklearn.model_selection import KFold #for K-fold cross validation from sklearn.model_selection import cross_val_score #score evaluation from sklearn.model_selection import cross_val_predict #prediction from sklearn.model_selection import cross_validate # GridSearchCV from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV #Common data processors from sklearn.preprocessing import OneHotEncoder, LabelEncoder from sklearn import feature_selection from sklearn import model_selection from sklearn import metrics from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array from scipy import sparse #Accuracy Score from sklearn.metrics import accuracy_score",No,5,22.0 "# to make this notebook's output stable across runs np.random.seed(123) # To plot pretty figures %matplotlib inline plt.rcParams['axes.labelsize'] = 14 plt.rcParams['xtick.labelsize'] = 12 plt.rcParams['ytick.labelsize'] = 12",No,5,23.0 "train = pd.read_csv(""../input/train_users_2.csv"")
test = pd.read_csv(""../input/test_users.csv"")
id_test = test['id']
labels = train['country_destination'].values
df_train = train.drop(['country_destination'], axis=1)
train_flag = df_train.shape[0]",Yes,3,45.0 "#We now concat Training and Test set df_total = pd.concat((df_train, test),axis=0, ignore_index = True)",No,5,11.0 "df_total = df_total.drop(['id','date_first_booking'], axis=1)",No,5,10.0 "#Date Account created - Capture Date, month and year seperately. date_ac = np.vstack(df_total.date_account_created.astype(str).apply(lambda x:list(map(int,x.split('-')))).values) df_total['Day'] = date_ac[:,0] df_total['Month']= date_ac[:,1] df_total['year'] = date_ac[:,2] df_total = df_total.drop(['date_account_created'],axis=1)",Yes,3,11.0 "#Time Stamp first active time_stp = np.vstack(df_total.timestamp_first_active.astype(str) .apply(lambda x: list(map(int,[x[:4],x[4:6],x[6:8],x[8:]]))).values) df_total['tfa_day'] = time_stp[:,0] df_total['tfa_Month'] = time_stp[:,1] df_total['tfa_year'] = time_stp[:,2] df_total = df_total.drop(['timestamp_first_active'],axis=1)",Yes,3,11.0 "forest_class = RandomForestClassifier(random_state = 42) n_estimators = [100, 500] min_samples_split = [10, 20] param_grid_forest = {'n_estimators' : n_estimators, 'min_samples_split' : min_samples_split} rand_search_forest = GridSearchCV(forest_class, param_grid_forest, cv = 4, refit = True, n_jobs = -1, verbose=2) rand_search_forest.fit(final_train_X, train_set_y)",Yes,3,6.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0])",No,5,45.0 "test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "df.rename(columns={'count': 'rentals'}, inplace=True)",No,5,61.0 df['rentals'] = np.log(df['rentals']),No,5,8.0 import math,No,5,22.0 "pd.concat([df, test])",No,5,11.0 "df = pd.concat([df, test])",No,5,11.0 "df['year'] = df.datetime.dt.year df['hour'] = df.datetime.dt.hour df['dayofweek'] = df.datetime.dt.dayofweek",No,5,8.0 "train, valid = train_test_split(df, random_state=42)",No,5,13.0 "removed_cols = ['rentals', 'casual', 'registered', 'datetime']",No,5,77.0 feats = [c for c in df.columns if c not in removed_cols],No,5,77.0 from sklearn.tree import DecisionTreeRegressor,No,5,22.0 "dt = DecisionTreeRegressor(random_state=42, max_depth=2)",No,5,4.0 "dt.fit(train[feats], train['rentals'])",No,5,7.0 from fastai.structured import draw_tree,No,5,22.0 from sklearn.metrics import mean_squared_error,No,5,22.0 "mean_squared_error(train['rentals'], train['preds'])",No,5,28.0 from sklearn.ensemble import RandomForestRegressor,No,5,22.0 "rf = RandomForestRegressor(random_state=42, n_jobs=-1)",No,5,4.0 "rf.fit(train[feats], train['rentals'])",No,5,7.0 train_preds = rf.predict(train[feats]),No,5,27.0 "mean_squared_error(train['rentals'], train_preds)**(1/2)",No,5,28.0 valid_preds = rf.predict(valid[feats]),No,5,48.0 "mean_squared_error(valid['rentals'], valid_preds)**(1/2)",No,5,49.0 "test[['datetime', 'count']].to_csv('rf.csv', index=False)",No,5,25.0 "# Prepare the train data train_data = process_data(train_images, TRAIN_DIR, isTrain=True) X = np.array([i[0] for i in train_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 3) y = np.array([i[1] for i in train_data])",No,5,21.0 plot_accuracy_and_loss(history),No,5,35.0 "import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import os
import re
print(os.listdir(""../input""))
def to_int(obj):
return int(re.sub(""[^\\d]"", \'\', obj))",No,5,88.0 "df = pd.read_csv('../input/2TWH_train.csv', index_col='IDNum') for col in df: if col[0] == ' ': df = df.rename(columns={col : col[1:]}) dt = pd.read_csv('../input/test.csv', index_col='IDNum') for col in dt: if col[0] == ' ': dt = dt.rename(columns={col : col[1:]})",Yes,3,45.0 "df['Source IP'] = df['Source IP'].apply(to_int) df['Destination IP'] = df['Destination IP'].apply(to_int) df['Timestamp'] = df['Timestamp'].apply(to_int) df['Flow Bytes/s'] = df['Flow Bytes/s'].astype(float) df['Flow Packets/s'] = df['Flow Packets/s'].astype(float) dt['Source IP'] = dt['Source IP'].apply(to_int) dt['Destination IP'] = dt['Destination IP'].apply(to_int) dt['Timestamp'] = dt['Timestamp'].apply(to_int) dt['Flow Bytes/s'] = dt['Flow Bytes/s'].astype(float) dt['Flow Packets/s'] = dt['Flow Packets/s'].astype(float)",No,5,16.0 "df = df.replace([np.inf, 'Infinity', 'infinity', 'inf'], 2**31-1) df = df.replace([np.nan, np.inf, 'NaN'], 0) dt = dt.replace([np.inf, 'Infinity', 'infinity', 'inf'], 2**31-1) dt = dt.replace([np.nan, 'NaN'], 0)",Yes,4,17.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42) ohe = OneHotEncoder(categories='auto') y_train_ohe = ohe.fit_transform(y_train.reshape(-1, 1)) y_test_ohe = ohe.fit_transform(y_test.reshape(-1, 1)) y_ohe = ohe.fit_transform(y.reshape(-1, 1)) scaler = StandardScaler() X_scale = scaler.fit_transform(X.astype(float)) X_train_scale = scaler.fit_transform(X_train.astype(float)) X_test_scale = scaler.transform(X_test.astype(float))",Yes,3,13.0 "from keras import Sequential from keras.layers import Dense model = Sequential() model.add(Dense(units=20, activation='relu', input_dim=X.shape[1])) model.add(Dense(units=10, activation='relu')) model.add(Dense(units=3, activation='softmax')) model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])",No,5,4.0 "history = model.fit(X_train_scale, y_train_ohe, epochs=1000, batch_size=8192, validation_data=(X_test_scale, y_test_ohe))",No,5,7.0 "import matplotlib.pyplot as plt loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(loss) + 1) plt.figure(figsize = (10, 10)) plt.semilogy(epochs, loss, 'bo', label='Training loss') plt.semilogy(epochs, val_loss, 'red', label='Validation loss') plt.title('Training and validation loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show()",No,5,35.0 "plt.clf() history_dict = history.history acc_values = history_dict['acc'] val_acc_values = history_dict['val_acc'] plt.figure(figsize = (10, 10)) plt.semilogy(epochs, acc_values, 'bo', label='Training acc') plt.semilogy(epochs, val_acc_values, 'red', label='Validation acc') plt.title('Training and validation accuracy') plt.xlabel('Epochs') plt.ylabel('Accuracy') plt.legend() plt.show()",No,5,35.0 "# Load packages import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from tqdm import tqdm from random import shuffle import os, gc, time, cv2, random, math %matplotlib inline import warnings warnings.filterwarnings('ignore') #################### # Global Constants # #################### INCEPTION_V3_WEIGHTS_PATH = '../input/inceptionv3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5' PATH = '../input/dogs-vs-cats-redux-kernels-edition/' TRAIN_DIR = PATH+'train' TEST_DIR = PATH+'test' NUM_CLASSES = 2 IMG_SIZE = 145 ### CHANNELS = 3 EPOCHS = 30 BATCH_SIZE = 32 train_images = os.listdir(TRAIN_DIR) test_images = 
os.listdir(TEST_DIR) # # For testing purposes # train_images = train_images[:10000] # test_images = test_images[:100]",No,4,77.0 "# Plotting loss and accuracy for the model def plot_accuracy_and_loss(history): eval_res = pd.DataFrame(history.history) f, ax = plt.subplots(1,2, figsize=(18,5)) for i, c in enumerate(['acc', 'loss']): ax[i].plot(eval_res[[c]], label=f'Training {c}') ax[i].plot(eval_res[[f'val_{c}']], label=f'Validation {c}') ax[i].set_xlabel('Epoch'); ax[i].set_ylabel(c); ax[i].legend(); ax[i].set_title(f'Training and validation {c}'); plt.grid(); plt.show() plot_accuracy_and_loss(history)",No,5,35.0 "import os print(os.listdir(""../input""))",No,5,88.0 "import pandas as pd from pathlib import Path from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression",No,5,22.0 "data_dir = Path('../input') train_df = pd.read_csv(data_dir / 'train.csv') test_df = pd.read_csv(data_dir / 'test.csv') sample_submission = pd.read_csv(data_dir / 'sampleSubmission.csv')",No,5,45.0 "print(train_df.shape) train_df.head()",Yes,3,58.0 "print(test_df.shape) test_df.head()",Yes,3,58.0 "features = ['open', 'high', 'low', 'close', 'volume', 'trades', 'macd', 'macd_hist', 'macd_signal', 'adx', 'di_plus', 'di_minus', 'rsi', 'cci', 'adl']",No,5,77.0 "X_train = train_df[features] y_train = train_df['y'] X_test = test_df[features]",No,5,21.0 "scaler = StandardScaler(with_std=False) X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test)",No,5,18.0 "regressor = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1, max_depth=7) regressor.fit(X_train, y_train)",No,5,7.0 y_test = regressor.predict(X_test),No,5,48.0 "sample_submission.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os print(os.listdir(""../input""))",No,5,88.0 "import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression as LR from sklearn.metrics import mean_squared_error import lightgbm as lgb from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV",No,5,22.0 "import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv('../input/train.csv') target = data.pop('y') data_test = pd.read_csv('../input/test.csv')",No,5,45.0 "out_df = pd.DataFrame(np.concatenate(preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('by_asset_lgbm.csv',index = False)",Yes,4,25.0 "out_df = pd.DataFrame(np.concatenate(grid_preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('grid_by_asset.csv',index = False)",Yes,4,25.0 "out_df = pd.DataFrame(np.concatenate(preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('Elastic_net.csv',index = False)",Yes,4,25.0 "data_dir = Path('../input') train_df = pd.read_csv(data_dir / 'train.csv') test_df = pd.read_csv(data_dir / 'test.csv') sample_submission = pd.read_csv(data_dir / 'sampleSubmission.csv') train_df['d'] = train_df['close'] / train_df['open'] test_df['d'] = test_df['close'] / test_df['open']",Yes,4,45.0 " features = ['asset','di_minus', 'rsi', 'cci','volume'] features = ['asset', 'open', 'high', 'low', 'close', 'volume', 'trades', 'macd', 'macd_hist', 'macd_signal', 'adx', 'di_plus', 'di_minus', 'rsi', 'cci', 'adl']",No,5,77.0 "X_train = train_df[features] y_train = train_df[['asset', 'y']] X_test = test_df[features]",No,5,21.0 "from numpy import column_stack scaler = StandardScaler() X_train = pd.DataFrame(data=scaler.fit_transform(train_df[features]), columns=features) X_train['asset'] = train_df['asset'] X_test = pd.DataFrame(data=scaler.transform(test_df[features]), columns=features) X_test['asset'] = test_df['asset']",No,5,21.0 "regressor = TssRegressor() regressor.fit(X_train, y_train) y_test = [item[1] for item in regressor.predict(X_test)] y_test[:12] = [0]*12 sample_submission['expected'] = y_test",Yes,3,7.0 "import pylab import calendar import numpy as np import pandas as pd import seaborn as sn from scipy import stats import missingno as msno from datetime import datetime import matplotlib.pyplot as plt import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) %matplotlib inline",No,5,23.0 "dailyData = pd.read_csv(""../input/train.csv"")",No,5,45.0 dailyData.shape,No,5,58.0 dailyData.head(2),No,5,41.0 dailyData.dtypes,No,5,70.0 "dailyData[""date""] = dailyData.datetime.apply(lambda x : x.split()[0])
dailyData[""hour""] = dailyData.datetime.apply(lambda x : x.split()[1].split("":"")[0])
dailyData[""weekday""] = dailyData.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()])
dailyData[""month""] = dailyData.date.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month])
dailyData[""season""] = dailyData.season.map({1: ""Spring"", 2 : ""Summer"", 3 : ""Fall"", 4 :""Winter"" })
dailyData[""weather""] = dailyData.weather.map({1: "" Clear + Few clouds + Partly cloudy + Partly cloudy"",\\
2 : "" Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist "", \\
3 : "" Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds"", \\
4 :"" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog "" })",No,4,8.0 "dailyData = dailyData.drop([""datetime""],axis=1)",No,5,10.0 "fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)
sn.boxplot(data=dailyData,y=""count"",orient=""v"",ax=axes[0][0])
sn.boxplot(data=dailyData,y=""count"",x=""season"",orient=""v"",ax=axes[0][1])
sn.boxplot(data=dailyData,y=""count"",x=""hour"",orient=""v"",ax=axes[1][0])
sn.boxplot(data=dailyData,y=""count"",x=""workingday"",orient=""v"",ax=axes[1][1])

axes[0][0].set(ylabel='Count',title=""Box Plot On Count"")
axes[0][1].set(xlabel='Season', ylabel='Count',title=""Box Plot On Count Across Season"")
axes[1][0].set(xlabel='Hour Of The Day', ylabel='Count',title=""Box Plot On Count Across Hour Of The Day"")
axes[1][1].set(xlabel=\'Working Day\', ylabel=\'Count\',title=""Box Plot On Count Across Working Day"")",No,5,33.0 "dailyDataWithoutOutliers = dailyData[np.abs(dailyData[""count""]-dailyData[""count""].mean())<=(3*dailyData[""count""].std())] ",No,5,14.0 "print (""Shape Of The Before Ouliers: "",dailyData.shape) print (""Shape Of The After Ouliers: "",dailyDataWithoutOutliers.shape)",No,5,58.0 "dataTrain = pd.read_csv(""../input/train.csv"") dataTest = pd.read_csv(""../input/test.csv"")",No,5,45.0 "data = dataTrain.append(dataTest) data.reset_index(inplace=True) data.drop('index',inplace=True,axis=1)",Yes,4,10.0 "data[""date""] = data.datetime.apply(lambda x : x.split()[0]) data[""hour""] = data.datetime.apply(lambda x : x.split()[1].split("":"")[0]).astype(""int"") data[""year""] = data.datetime.apply(lambda x : x.split()[0].split(""-"")[0]) data[""weekday""] = data.date.apply(lambda dateString : datetime.strptime(dateString,""%Y-%m-%d"").weekday()) data[""month""] = data.date.apply(lambda dateString : datetime.strptime(dateString,""%Y-%m-%d"").month)",No,5,8.0 "categoricalFeatureNames = [""season"",""holiday"",""workingday"",""weather"",""weekday"",""month"",""year"",""hour""]
numericalFeatureNames = [""temp"",""humidity"",""windspeed"",""atemp""]
dropFeatures = [\'casual\',""count"",""datetime"",""date"",""registered""]",No,5,77.0 "for var in categoricalFeatureNames: data[var] = data[var].astype(""category"")",No,5,16.0 "dataTrain = dataTrain.drop(dropFeatures,axis=1) dataTest = dataTest.drop(dropFeatures,axis=1)",No,5,10.0 "from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) # Initialize logistic regression model lModel = LinearRegression() # Train the model yLabelsLog = np.log1p(yLabels) lModel.fit(X = dataTrain,y = yLabelsLog) # Make predictions preds = lModel.predict(X= dataTrain) print (""RMSLE Value For Linear Regression: "",rmsle(np.exp(yLabelsLog),np.exp(preds),False))",Yes,3,7.0 "from sklearn.ensemble import GradientBoostingRegressor gbm = GradientBoostingRegressor(n_estimators=4000,alpha=0.01); ### Test 0.41 yLabelsLog = np.log1p(yLabels) gbm.fit(dataTrain,yLabelsLog) preds = gbm.predict(X= dataTrain) print (""RMSLE Value For Gradient Boost: "",rmsle(np.exp(yLabelsLog),np.exp(preds),False))",Yes,3,7.0 "predsTest = gbm.predict(X= dataTest) fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sn.distplot(yLabels,ax=ax1,bins=50) sn.distplot(np.exp(predsTest),ax=ax2,bins=50)",No,5,56.0 "submission = pd.DataFrame({
""datetime"": datetimecol,
""count"": [max(0, x) for x in np.exp(predsTest)]
})
submission.to_csv(\'bike_predictions_gbm_separate_without_fe.csv\', index=False)",No,5,25.0 "import math import numpy as np import pandas as pd import matplotlib.pyplot as plt import time import os %matplotlib inline",No,5,23.0 train = pd.read_csv('../input/training/training.csv'),No,5,45.0 train.dropna(inplace=True),No,5,17.0 train.tail(3),No,5,41.0 test = pd.read_csv('../input/test/test.csv'),No,5,45.0 "train.shape, test.shape",No,5,58.0 "x = np.stack(train.Image)[..., None]",No,5,11.0 x.shape,No,5,58.0 "x_t = np.stack(test.Image)[..., None]",No,5,11.0 x_t.shape,No,5,58.0 "from IPython.display import SVG from keras.models import Sequential from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPool2D, Flatten, LeakyReLU, ELU from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.utils.vis_utils import model_to_dot",No,5,22.0 "np.random.seed(777) model10 = Sequential() model10.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) model10.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) model10.add(Dropout(0.3)) model10.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) model10.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) model10.add(Dropout(0.5)) model10.add(Flatten()) model10.add(Dense(128, activation = 'relu')) model10.add(Dropout(0.7)) model10.add(Dense(30, activation = 'relu'))",No,5,4.0 MODEL_DIR = '../model/',No,5,77.0 modelpath = '../model/{epoch:02d}-{val_loss:4f}.hdf5',No,5,77.0 predict = model10.predict(x),No,5,27.0 "train_loss = history.history['loss'] val_loss = history.history['val_loss'] x_len = np.arange(len(train_loss)) plt.plot(x_len, train_loss, marker='.', c='red', label='Train_loss') plt.plot(x_len, val_loss, marker='.', c='blue', label='Val_loss') plt.legend(loc='upper right') plt.grid() plt.xlabel('epoch') plt.ylabel('loss') plt.show()",No,5,35.0 y_t = model10.predict(x_t),No,5,48.0 look_id = pd.read_csv('../input/IdLookupTable.csv'),No,5,45.0 look_id.info(),No,5,40.0 "look_id.drop('Location', axis=1, inplace=True)",No,5,10.0 "look_id[['RowId','Location']].to_csv('Predict.csv',index=False)",No,5,25.0 "from xgboost import XGBClassifier import xgboost as xgb",No,5,22.0 "#for scaling from sklearn.preprocessing import StandardScaler",No,5,22.0 "data = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') ",No,5,45.0 print(data.shape),No,5,58.0 data.columns,No,5,71.0 "df=pd.concat([df,Date,Time],axis=1) #df #-----------test data------------------ df_test=pd.concat([df_test,Date_test,Time_test],axis=1) ",No,5,11.0 "df=df.drop(labels=['Dates'],axis=1) #-----------test data------------------ df_test=df_test.drop(labels=['Dates'],axis=1) ",No,5,10.0 df.columns,No,5,71.0 "df[""rot60_X""]=(0.5) * df[""Y""] + (1.732/2) * df[""X""] df[""rot60_Y""]=0.5 * df[""Y""] - (1.732/2) * df[""X""] df_test[""rot60_X""]=(0.5) * df_test[""Y""] + (1.732/2) * df_test[""X""] df_test[""rot60_Y""]=0.5 * df_test[""Y""] - (1.732/2) * df_test[""X""] df[""rot45_X""]=0.707 * df[""Y""] + 0.707 * df[""X""] df[""rot45_Y""]=0.707 * df[""Y""] - 0.707 * df[""X""] df_test[""rot45_X""]=0.707 * df_test[""Y""] + 0.707 * df_test[""X""] df_test[""rot45_Y""]=0.707 * df_test[""Y""] - 0.707 * df_test[""X""] df[""rot30_X""]=(1.732/2) * df[""Y""] + 0.5 * df[""X""] df[""rot30_Y""]=(1.732/2) * df[""Y""] - 0.5 * df[""X""] df_test[""rot30_X""]=(1.732/2) * df_test[""Y""] + 0.5 * df_test[""X""] df_test[""rot30_Y""]=(1.732/2) * df_test[""Y""] - 0.5 * df_test[""X""] ",No,5,8.0 
"df[""radial60""]=np.sqrt(np.power(df[\'rot60_X\'],2) + np.power(df[\'rot60_Y\'],2))

df_test[""radial60""]=np.sqrt(np.power(df_test[\'rot60_X\'],2) + np.power(df_test[\'rot60_Y\'],2))",No,5,8.0 "df=df.drop(labels='rot60_X',axis=1) df_test=df_test.drop(labels='rot60_X',axis=1)",No,5,10.0 "df=df.drop(labels='rot60_Y',axis=1) df_test=df_test.drop(labels='rot60_Y',axis=1)",No,5,10.0 "df=df.drop(labels='Second',axis=1) df_test=df_test.drop(labels='Second',axis=1)",No,5,10.0 "df['Minute']=df['Minute'].apply(lambda x:int(x)) df['Minute']=df['Minute'].apply(lambda x : 'low' if x <31 else 'high') df_test['Minute']=df_test['Minute'].apply(lambda x:int(x)) df_test['Minute']=df_test['Minute'].apply(lambda x : 'low' if x <31 else 'high') ",No,5,8.0 "df['DayOfWeek']= df['DayOfWeek'].apply(lambda x : 'WeekHigh' if x in ('Wednesday','Friday') else ('WeekMed' if x in ('Tuesday','Thursday','Saturday') else 'WeekLow')) df_test['DayOfWeek']= df_test['DayOfWeek'].apply(lambda x : 'WeekHigh' if x in ('Wednesday','Friday') else ('WeekMed' if x in ('Tuesday','Thursday','Saturday') else 'WeekLow')) ",No,5,8.0 "df['Intersection']=df['Address'].apply(lambda x : 1 if '/' in x else 0) df['Block']=df['Address'].apply(lambda x : 1 if 'Block' in x else 0) df_test['Intersection']=df_test['Address'].apply(lambda x : 1 if '/' in x else 0) df_test['Block']=df_test['Address'].apply(lambda x : 1 if 'Block' in x else 0)",No,5,8.0 "Id=df['Id'] df=df.drop(['Descript','Resolution','Id'],axis=1) #----------test data--------- Id_test=df_test['Id'] df_test=df_test.drop(['Descript','Resolution','Id'],axis=1)",No,5,10.0 "lasso = linear_model.Lasso(alpha=0.1) score1 = cross_val_score(lasso, X_train, y_train, cv=10) score1.mean()",No,5,28.0 "ridge = linear_model.Ridge(alpha=0.1) score2 = cross_val_score(ridge, X_train, y_train, cv=10) score2.mean()",No,5,28.0 from sklearn.neighbors import KNeighborsRegressor,No,5,22.0 "score3 = [] neighbor = [] for k in range(10, 100, 10): knn = KNeighborsRegressor(n_neighbors=k, weights='distance') score3.append(cross_val_score(knn, X_train, y_train, cv=10).mean()) neighbor.append(k)",No,5,2.0 score3,No,5,53.0 "plt.plot(neighbor, score3, 'ro')",No,5,33.0 "score3 = [] neighbor = [] for k in range(1, 32, 2): knn = KNeighborsRegressor(n_neighbors=k, weights='distance') score3.append(cross_val_score(knn, X_train, y_train, cv=10).mean()) neighbor.append(k)",No,5,84.0 neighbor,No,5,41.0 "knn = KNeighborsRegressor(n_neighbors=21) score3 = cross_val_score(knn, X_train, y_train, cv=10) score3.mean()",No,5,28.0 "score4 = [] trees = [] for n in range(10,101,10): regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=n) score4.append(cross_val_score(regr, X_train, y_train, cv=10).mean()) trees.append(n)",No,2,27.0 "plt.plot(trees, score4, 'ro')",No,5,56.0 "regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=80) score4.append(cross_val_score(regr, X_train, y_train, cv=10).mean())",No,5,28.0 score4[-1],No,5,41.0 "df[""raw_radial""]=np.sqrt(np.power(df[\'X\'],2) + np.power(df[\'Y\'],2))

df_test[""raw_radial""]=np.sqrt(np.power(df_test[\'X\'],2) + np.power(df_test[\'Y\'],2))",No,5,8.0 test = pd.read_csv('../input/datasetss/test.csv'),No,5,45.0 "le_res=le.fit_transform(df['Category']) cat=pd.DataFrame(le_res) cat.columns=['Category'] df=df.drop(labels=['Category'],axis=1) df=pd.concat([cat,df],axis=1) df.columns",Yes,3,20.0 "X_test = test.drop('Id',axis=1)",No,5,10.0 "lasso.fit(X_train, y_train)",No,5,7.0 prediction1 = lasso.predict(X_test),No,5,48.0 df_test.columns,No,5,71.0 "df=df[['Address', 'Minute', 'Hour', 'Day', 'Month', 'Year', 'District', 'DayOfWeek', 'X', 'Y', 'rot45_X', 'rot45_Y', 'rot30_X', 'rot30_Y', 'radial60', 'Intersection', 'Block', 'raw_radial', 'closest_centers_f', 'label']] df_test=df_test[['Address', 'Minute', 'Hour', 'Day', 'Month', 'Year', 'District', 'DayOfWeek', 'X', 'Y', 'rot45_X', 'rot45_Y', 'rot30_X', 'rot30_Y', 'radial60', 'Intersection', 'Block', 'raw_radial', 'closest_centers_f', 'label']]",No,5,10.0 "prediction1 = abs(prediction) prediction1[0]",Yes,4,55.0 "prediction = pd.DataFrame({'Id':test.Id,'median_house_value': prediction1[0]})",No,5,55.0 prediction,No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.model_selection import train_test_split
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from sklearn.svm import SVC
import os
print(os.listdir(""../input""))
from wordcloud import WordCloud
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Any results you write to the current directory are saved as output.",No,5,88.0 train_df = pd.read_csv('../input/train.csv'),No,5,45.0 train_df.head(),No,5,41.0 "train_df['president'].value_counts().plot(kind = 'bar') plt.show()",No,5,33.0 "#Independent Column X=df X.shape ",No,5,58.0 "#Dependent X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,shuffle=False)",No,5,13.0 "train_data = [] for i, row in train_df.iterrows(): for text in row['text'].split('.'): train_data.append([row['president'], text]) train_data = pd.DataFrame(train_data, columns=['president', 'text'])",No,5,12.0 "train_data['president'].value_counts().plot(kind = 'bar') plt.show()",No,5,33.0 train_data.head(),No,5,41.0 "def remove_punctuation_numbers(text): punc_numbers = string.punctuation + '0123456789' return ''.join([l for l in text if l not in punc_numbers])",No,5,78.0 "def tokeniser(text): return TreebankWordTokenizer().tokenize(text)",No,5,78.0 "def lemmetizer(tokens): wordnet_lemmatizer = WordNetLemmatizer() return [wordnet_lemmatizer.lemmatize(word) for word in tokens]",No,5,84.0 "def remove_stop_words(tokens): return [t for t in tokens if t not in set(stopwords.words('english'))]",No,5,84.0 "def data_cleaner(text): text = text.lower() text = remove_punctuation_numbers(text) lst = tokeniser(text) lst = remove_stop_words(lst) return ' '.join(lemmetizer(lst))",No,5,78.0 train_data['clean_text'] = train_data['text'].apply(data_cleaner),No,5,8.0 "for pres in train_data['president'].unique(): words =[] for sentence in train_data[train_data['president'] == pres].clean_text: words.extend(tokeniser(sentence)) wordcloud = WordCloud().generate_from_frequencies(frequencies=Counter(words)) plt.figure(figsize=(12,8)) plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.title(pres) plt.show()",No,5,53.0 "train_data.president = train_data.president.map({'deKlerk':0,'Mandela':1, 'Mbeki':2, 'Motlanthe':3, 'Zuma': 4, 'Ramaphosa':5})",No,5,20.0 "X = train_data.clean_text y = train_data.president X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)",Yes,4,13.0 "vect = CountVectorizer(ngram_range=(1,2))",No,5,84.0 X_train_ = vect.fit_transform(X_train),No,5,8.0 "log = LogisticRegressionCV(dual=False, penalty='l2', multi_class='multinomial')",No,5,4.0 "log.fit(X_train_, y_train)",No,5,7.0 "print(accuracy_score(y_train, log.predict(X_train_)))",No,5,28.0 "print(accuracy_score(y_test, log.predict(vect.transform(X_test))))",No,5,49.0 test_data = pd.read_csv('../input/test.csv'),No,5,45.0 test_data.head(),No,5,41.0 test_data.text = test_data.text.apply(data_cleaner),No,5,8.0 test_data['president'] = log.predict(vect.transform(test_data.text)),No,5,48.0 "test_data.drop('text', axis=1, inplace=True,)",No,5,10.0 "test_data.to_csv('Thapelo_log.csv',index=False)",No,5,25.0 "import pandas as pd import numpy as np import string import re import seaborn as sns import matplotlib.pyplot as plt sns.set(style='whitegrid', palette='muted', rc={'figure.figsize': (15,10)})",Yes,4,22.0 "print(train.shape, test.shape)",No,5,58.0 "pres = {'deKlerk': 0, 'Mandela': 1, 'Mbeki': 2, 'Motlanthe': 3, 'Zuma': 4, 'Ramaphosa': 5} train.replace({'president': pres}, inplace=True)",No,5,20.0 "# speech number: intro lines starts = { 0: 1, 1: 1, 2: 1, 3: 12, 4: 12, 5: 5, 6: 1, 7: 1, 8: 8, 9: 9, 10: 12, 11: 14, 12: 14, 13: 15, 14: 15, 15: 15, 16: 15, 17: 15, 18: 15, 19: 15, 20: 20, 21: 1, 22: 15, 23: 20, 24: 20, 25: 15, 26: 15, 27: 20, 28: 20, 29: 15, 30: 18 }",No,5,77.0 "def divide_on(df, 
char): # iterate over text column of DataFrame, splitting at each occurrence of char sentences = [] # let's split the data into senteces for i, row in df.iterrows(): # skip the intro lines of the speech for sentence in row['text'].split(char)[starts[i]:]: sentences.append([row['president'], sentence]) df = pd.DataFrame(sentences, columns=['president', 'text']) return df[df['text'] != '']",No,5,78.0 "train = divide_on(train, '.')",No,5,53.0 train.head(5),No,5,41.0 train['president'].value_counts(),No,5,72.0 "# proportion of total train['president'].value_counts()/train.shape[0]",No,5,72.0 "train['sentence'] = None test['president'] = None df = pd.concat([train, test], axis=0, sort=False)",No,5,11.0 "# reorder columns df = df[['sentence', 'text', 'president']]",No,5,10.0 df.tail(),No,5,41.0 "def fixup(text):

# remove punctuation
text = ''.join([char for char in text if char == '-' or char not in string.punctuation])
# remove special characters
text = text.replace(r'^[*-]', '')
# remove numbers
text = ''.join([char for char in text if not char.isdigit()])
# lowercase
text = text.lower()

# remove hanging whitespace
text = "" "".join(text.split())

return text


df[\'text\'] = df[\'text\'].apply(fixup)",No,5,78.0 "# get length of sentence as variable df['length'] = df['text'].apply(len)",No,5,8.0 "# what are our longest sentences? df.sort_values(by='length', ascending=False).head(10)",No,5,41.0 df.loc[3930][1],No,5,14.0 "# what are our shortest sentences? df.sort_values(by='length').head(5)",No,5,41.0 "# let's check the shortest sentences in our test set df[pd.isnull(df['president'])].sort_values(by='length').head()",No,5,41.0 "# sentences with just a few characters are of no use to us df = df[df['length']>10]",No,5,14.0 "# what are our shortest sentences now? df.sort_values(by='length').head(5)",No,5,41.0 df['president'].value_counts(),No,5,72.0 "from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from keras.wrappers.scikit_learn import KerasClassifier from keras.models import Sequential from keras.layers import Dense, Activation, Dropout from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer",No,5,22.0 "tfidf = TfidfVectorizer(strip_accents='unicode', ngram_range=(1,3), stop_words='english', min_df=6) X = tfidf.fit_transform(df['text']).todense() X.shape",Yes,4,8.0 tfidf.get_feature_names(),No,5,53.0 "X = pd.DataFrame(data=X, columns=tfidf.get_feature_names())",No,5,21.0 "df = df.drop(columns=['text', 'length'], axis=1)",No,5,10.0 "X = pd.DataFrame(np.hstack((df, X)))",No,5,12.0 "HYPER_PARAMS = { 'learning_rate': 0.02, 'n_estimators':800, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'max_delta_step': 1, 'objective': 'multi:softmax', 'nthread': 4, 'seed': 1747 } model = xgb.XGBClassifier(**HYPER_PARAMS) model.fit(X,y) ",No,5,7.0 "X.columns = ['sentence_id', 'president_id'] + tfidf.get_feature_names()",No,5,61.0 y_pred=model.predict_proba(df_test),No,5,48.0 "train = X[pd.isnull(X['sentence_id'])] test = X[pd.notnull(X['sentence_id'])]",No,5,14.0 "X_train = train.drop(['sentence_id', 'president_id'], axis=1) X_test = test.drop(['sentence_id', 'president_id'], axis=1)",No,5,10.0 "y_pred= pd.DataFrame(y_pred, index=Id_test,columns = le.classes_)",No,5,12.0 "y_pred.to_csv(""submit.csv"", float_format = \'%.5F\')",No,5,25.0 "def one_hot_encode(label): # initialize zero array vec = [0, 0, 0, 0, 0, 0] # set index of array corresponding to label = 1 vec[label] = 1 return vec # save encoded labels as target for model y_train = np.vstack(row for row in train['president_id'].apply(one_hot_encode).values)",No,5,20.0 y_train[600],No,5,41.0 "print('Train size:', X_train.shape) print('Test size:', X_test.shape)",No,5,58.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "# importing the modulus from keras.preprocessing.image import ImageDataGenerator,img_to_array,load_img from keras.models import Sequential from keras.layers import Dropout, Flatten, Dense , Activation from keras import applications import re",No,5,22.0 "def create_model(lyrs=[X_train.shape[1], 1028, 512, 256], act='relu', opt='Adam', dr=0.25): model = Sequential() # create first hidden layer model.add(Dense(lyrs[0], input_dim=X_train.shape[1], activation=act)) # create additional hidden layers for i in range(1,len(lyrs)): model.add(Dense(lyrs[i], activation=act)) # add dropout, default is none model.add(Dropout(dr)) # create output layer model.add(Dense(6, activation='softmax')) # output layer model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) return model",No,5,4.0 "model = create_model() print(model.summary())",No,5,84.0 "# train model on full train set, with 80/20 CV split
training = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
val_acc = np.mean(training.history['val_acc'])
print(""\
%s: %.2f%%"" % (\'val_acc\', val_acc*100))",Yes,4,7.0 "# summarize history for accuracy plt.plot(training.history['acc']) plt.plot(training.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'validation'], loc='upper left') plt.show()",No,5,35.0 predictions = model.predict(X_test),No,5,48.0 "pred_lbls = [] for pred in predictions: pred = list(pred) max_value = max(pred) max_index = pred.index(max_value) pred_lbls.append(max_index) predictions = np.array(pred_lbls)",No,5,55.0 predictions.shape,No,5,58.0 test['president_id'] = predictions,No,5,8.0 test['president_id'].value_counts(),No,5,72.0 "submission = test[['sentence_id','president_id']] submission.columns = ['sentence', 'president'] submission.to_csv('rnn_1.csv', index=False)",No,5,25.0 submission.president.value_counts(),No,5,72.0 "#Dataframes etc import pandas as pd import numpy as np #Visualization import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline from matplotlib.colors import ListedColormap from pylab import rcParams rcParams['figure.figsize'] = 10, 8 sns.set_style('whitegrid') #Machine learning: from sklearn import preprocessing ## ML Cross validation and metrics from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn import metrics ## ML models from sklearn.linear_model import LogisticRegression #Natural language processing import nltk #nltk.download('punkt') from nltk.tokenize import word_tokenize, TreebankWordTokenizer ## to automate the NLP extraction from sklearn.feature_extraction.text import CountVectorizer ",No,5,23.0 "df=pd.read_csv('../input/train.csv') df.head() ",Yes,4,45.0 df['president'].unique(),No,5,57.0 "df.info() #no nulls, nothing unexpected",No,5,40.0 " dict = {'deKlerk': 0, 'Mandela': 1, 'Mbeki': 2, 'Motlanthe': 3, 'Zuma': 4, 'Ramaphosa': 5} ",No,5,77.0 "df['presi_num']= df['president'] df['presi_num']=df['presi_num'].replace(dict) df.head()",Yes,4,8.0 "df['speech length']=df['text'].str.len() df.head()",Yes,4,8.0 "df_summary = pd.DataFrame(df.groupby('president')['speech length'].mean()) df_summary=df_summary.reset_index() df_summary = df_summary.sort_values(by='speech length') sns.barplot(data = df_summary, x='president', y='speech length')",No,5,33.0 "df_summary['speech_length_%']=df_summary['speech length'].div(df_summary['speech length'].sum(), axis=0).multiply(100) df_summary.head()",Yes,4,8.0 list(df_summary['speech_length_%']),No,5,41.0 "df_summary['presi_num']= df_summary['president'] df_summary['presi_num']= df_summary['presi_num'].replace(dict) df_summary.sort_values(by='presi_num', ascending = True) df_summary",Yes,4,8.0 "#check list(df_summary['presi_num'])",No,5,41.0 "import string def remove_punctuation_numbers(post): punc_numbers = string.punctuation + '0123456789' return ''.join([l for l in post if l not in punc_numbers]) ",No,5,78.0 # from sklearn.model_selection import GridSearchCV\n# from xgboost import XGBRegressor\n\n# y_Train = X_Train.ConfirmedCases\n# hyperParam = {\,No,5,76.0 \,No,5,76.0 #print (type(Y_train_CC))\n#X_train_CC.info(),No,5,76.0 "#print (X_train_CC.shape, X_train_Fat.shape, X_test.shape)",No,5,76.0 "#TODO: check duplicates,missing numeric, string, typo.",No,5,76.0 "#cols_with_missing = [col for col in X_train.columns \n# if X_train[col].isnull().any()]\n#X_train = X_train.drop(cols_with_missing, axis=1)\n#X_test = X_test.drop(cols_with_missing, axis=1)",No,5,76.0 #NO NULL,No,5,76.0 #gbr = 
GradientBoostingRegressor(random_state=17),No,5,76.0 # \n\n# df_train.isna().sum()\n# df_test.isna().sum()\n\n# ,No,5,76.0 # \n\n# pd.DataFrame(df_train).dtypes,No,5,76.0 "#dftrain.groupby('City')['revenue'].agg(['count','mean'])",No,5,76.0 "#P1 int testset has no 7,8,10,11,13,14,16 values skew 1.8\n#P2 FLOAT skew 0.03 test set only few decima values between 1 and 2, 4 and 5, \n#P3 float skew 0.14 test set only some decimal values between 4 and 5\n#P10 int64 skew 1.70 test set 80% values are 5, 15% values are 10, test set exactly like train set.\n#P22 int64 skew 0.79 test set 25% values are 1, 20% 2, 15% 3 , 12% 4, 9% 5 gradually decreasing distribution\n#P23 int64 skew 3.24 test set 40% values are 1, 15% 2, 12% 3, 9% 4, 14% 5, 3-4% each 10 15 20 25\n#P10 train and test exactly thesame",No,5,76.0 "#no log transform = P2, P3, P7,",No,5,76.0 #for col in dftrain.columns:\n # if (dftrain[col].dtype == int) | (dftrain[col].dtype == float):\n # print (col)\n # print (skew(dftrain[col]))\n # print (skew(np.log1p(dftrain[col]))),No,5,76.0 # import pandas_profiling as pdp\n# pdp.ProfileReport(train),No,5,76.0 "# city_rev = []\n\n# for i in train['City']:\n# for key, value in mean_dict.items():\n# if i == key:\n# city_rev.append(value)\n \n# df_city_rev = pd.DataFrame({'city_rev':city_rev})\n# train = pd.concat([train,df_city_rev],axis=1)\n# train.head()",No,5,76.0 "# train.replace({""City"":mean_dict}, inplace=True)\n# test.replace({""City"":mean_dict}, inplace=True)\n# test[\",No,5,76.0 # train.iloc[list(tukey_outliers(df_num.acceleration).index)],No,5,76.0 "# \n# for i in range(len(num_list)):\n# # \n# upper_lim = full_data[num_list[i]].quantile(.95)\n# lower_lim = full_data[num_list[i]].quantile(.05)\n \n# # IQR\n# Q1 = full_data[num_list[i]].quantile(.25)\n# Q3 = full_data[num_list[i]].quantile(.75)\n# IQR = Q3 - Q1\n# outlier_step = 1.5 * IQR\n \n# # 1.5IQR95%tile5%tile\n# full_data.loc[(full_data[num_list[i]] > (Q3 + outlier_step)), num_list[i]] =upper_lim\n# full_data.loc[(full_data[num_list[i]] < (Q1 - outlier_step)), num_list[i]] = lower_lim",No,5,76.0 "# columns = len(num_list)/4+1\n\n# # boxplot\n# fig = plt.figure(figsize=(15,20))\n# plt.subplots_adjust(hspace=0.2, wspace=0.8)\n# for i in range(len(num_list)):\n# ax = fig.add_subplot(columns, 4, i+1)\n# sns.boxplot(y=full_data[num_list[i]], data=full_data, ax=ax)\n# plt.show()",No,5,76.0 "# skew_col = skewed_data[skewed_data > 10].index\n\n# # \n# fig = plt.figure(figsize=(10, 8))\n# for i in range(len(skew_col)):\n# ax = fig.add_subplot(2, 3, i+1)\n# try:\n# sns.distplot(combined_df[skew_col[i]], fit=norm, ax=ax)\n# except:\n# # kdekde=False\n# sns.distplot(combined_df[skew_col[i]], fit=norm, kde=False, ax=ax)\n# plt.show()\n\n# # \n# for i in range(len(skew_col)):\n# combined_df[skew_col[i]] = np.log1p(combined_df[skew_col[i]])\n \n# # \n# # \n# fig = plt.figure(figsize=(10, 8))\n# for i in range(len(skew_col)):\n# ax = fig.add_subplot(2, 3, i+1)\n# try:\n# sns.distplot(combined_df[skew_col[i]], fit=norm, ax=ax)\n# except:\n# # kdekde=False\n# sns.distplot(combined_df[skew_col[i]], fit=norm, kde=False, ax=ax)\n# plt.show()",No,5,76.0 "# #LightGBM\n# import lightgbm as lgb\n# #\n# import optuna\n\n# lgb_train = lgb.Dataset(X_train, y_train)\n# lgb_eval = lgb.Dataset(X_test, y_test)",No,5,76.0 "# def objective(trial):\n# params = {'metric': {'rmse'},\n# 'max_depth' : trial.suggest_int('max_depth', 1, 10),\n# 'subsumple' : trial.suggest_uniform('subsumple', 0.0, 1.0),\n# 'subsample_freq' : trial.suggest_int('subsample_freq', 0, 1),\n# 
'leaning_rate' : trial.suggest_loguniform('leaning_rate', 1e-5, 1),\n# 'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.0, 1.0),\n# 'lambda_l1' : trial.suggest_uniform('lambda_l1' , 0.0, 1.0),\n# 'lambda_l2' : trial.suggest_uniform('lambda_l2' , 0.0, 1.0)}\n \n# gbm = lgb.train(params,\n# lgb_train,\n# valid_sets=(lgb_train, lgb_eval),\n# num_boost_round=10000,\n# early_stopping_rounds=100,\n# verbose_eval=50)\n# predicted = gbm.predict(X_test)\n# RMSE = np.sqrt(mean_squared_error(y_test, predicted))\n \n# pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')\n# return RMSE",No,5,76.0 "# study = optuna.create_study()\n# study.optimize(objective, timeout=360)",No,5,76.0 # print(\,No,5,76.0 "\n# #Optuna\n# params = {""metric"": {\",No,5,76.0 "# #\n# lgb.plot_importance(gbm, height=0.5, figsize=(8,16))",No,5,76.0 # \n# prediction_log = gbm.predict(test_X)\n# print(prediction_log)\n# prediction =np.exp(prediction_log) \n# print(prediction),No,5,76.0 "# cap revenue at 10,000,000 for outliers\n# df.loc[df['revenue'] > 10000000, 'revenue'] = 10000000",No,5,76.0 "# train_profile = ProfileReport(train, title='Pandas Profiling Report', html={'style':{'full_width':True}})\n# train_profile",No,5,76.0 "# test_profile = ProfileReport(test, title='Pandas Profiling Report', html={'style':{'full_width':True}})\n# test_profile",No,5,76.0 # date_encoded = {}\n# for s in train[\,No,5,76.0 # train['date_encoded'] = train['Date'].apply(lambda x: date_encoded[x])\n# train['date_encoded'] = (train['date_encoded'] - train['date_encoded'].mean()) / train['date_encoded'].std()\n# train.head(),No,5,76.0 #INVERSE TRANSFORM\n#pred_f = pred_f*scale,No,5,76.0 "#result.set_index('ForecastId', inplace=True)",No,5,76.0 "#result= result[['ConfirmedCases','Fatalities']].round(0)",No,5,76.0 " #submission['ConfirmedCases'] = [0 if submission.loc[i, 'ConfirmedCases'] <= -0 \n # else submission.loc[i, 'ConfirmedCases'] for i in submission.index]",No,5,76.0 "'''train = df_train.values\nX_train, y_train = train[:,:-2], train[:,-2:]'''",No,5,76.0 "'''model1 = XGBRegressor(\n learning_rate =0.1,\n n_estimators=1000,\n max_depth=5,\n min_child_weight=1,\n gamma=0,\n subsample=0.8,\n colsample_bytree=0.8,\n objective= 'reg:squarederror',\n scale_pos_weight=1)\nmodelfit(model1, X_train, y_train[:,0])'''",No,5,76.0 "'''model2 = XGBRegressor(\n learning_rate =0.1,\n n_estimators=1000,\n max_depth=5,\n min_child_weight=1,\n gamma=0,\n subsample=0.8,\n colsample_bytree=0.8,\n objective= 'reg:squarederror',\n scale_pos_weight=1)\nmodelfit(model2, X_train, y_train[:,1])'''",No,5,76.0 "'''df_submit.ConfirmedCases = df_submit.ConfirmedCases.apply(lambda x:max(0,round(x,0)))\ndf_submit.Fatalities = df_submit.Fatalities.apply(lambda x:max(0,round(x,0)))'''",No,5,76.0 #?TabularPandas,No,5,76.0 "# estimators = [('rf',RF_model ), ('ada', adaboost_model_for_ConfirmedCases)]\n# stacking_model_for_ConfirmedCases = StackingClassifier(estimators=estimators, n_jobs=4)\n# stacking_model_for_ConfirmedCases.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[0]])",No,5,76.0 "# stacking_model_for_Fatalities = StackingClassifier(estimators=estimators, n_jobs=4)\n# stacking_model_for_Fatalities.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[1]])",No,5,76.0 "# predicted = stacking_model_for_ConfirmedCases.predict(test_numeric_X)\n# predicted2 = stacking_model_for_Fatalities.predict(test_numeric_X)\n\n# submission = np.vstack((test['ForecastId'], predicted,predicted2)).T\n# submission = 
submission.astype(np.int32)\n\n# df = pd.DataFrame(data=submission, columns=['ForecastId','ConfirmedCases','Fatalities'])\n# df.to_csv('stacking_submission.csv', index=False)\n# df.to_csv('submission.csv', index=False)",No,5,76.0 # from sklearn.neighbors import KNeighborsClassifier\n# from sklearn.naive_bayes import GaussianNB \n# from sklearn.linear_model import LogisticRegression\n# from sklearn import model_selection\n# from mlxtend.classifier import StackingCVClassifier,No,5,76.0 # clf1 = KNeighborsClassifier(n_neighbors=100)\n# clf2 = RandomForestClassifier(n_estimators=5)\n# clf3 = GaussianNB()\n# # Logit will be used for stacking\n# lr = LogisticRegression(solver=\,No,5,76.0 "# Tried adding spaCy coref as a feature. I have no expertise with this library.\n# It seems to work in many cases, but for some cases the coref resolves to just \n# he/she/etc rather than a noun. Not sure if it is because the coref model is \n# not confident, I'm navigating the object model incorrectly, or just a limitation\n# of the model. But I do see some gain.\n\n# forked from: https://www.kaggle.com/shujian/ml-model-example-with-train-test\n# loading spaCy coref extension like: https://www.kaggle.com/ryches/applying-spacy-coreference-but-nothing-goes-right",No,5,76.0 #os.system(f'ls {mf}1'),No,5,76.0 "#fig, (axis1) = plt.subplots(1,1,figsize=(8,3))\n#sns.countplot(x = 'Open', hue = 'DayOfWeek', data = data_train,)",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = data_train, size = 3, kind =""box"", palette =""muted"")",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = data_train, size = 3, kind =""box"", palette =""muted"")",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = data_train, size = 4, kind =""bar"", palette =""muted"")",No,5,76.0 "#fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(12,3))\n#sns.barplot(average_store_type.index, average_store_type['Sales'], ax=axis1)\n#sns.barplot(average_store_type.index, average_store_type['Customers'], ax=axis2)\n#sns.barplot(average_store_type.index, average_store_type['CompetitionDistance'], ax=axis3)",No,5,76.0 "#fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,3))\n#sns.barplot(average_assort.index, average_assort['Sales'], ax=axis1)\n#ns.barplot(average_assort.index, average_assort['Customers'], ax=axis2)",No,5,76.0 "# ( )\n# res_list_depth = []\n# res_list_nestim = []\n# for i in range (1, 1000, 50):\n# store_part = train_stores[i]\n# X_train_part = store_part.drop([""Sales"", ""Store"", ""Customers""],axis=1)\n# Y_train_part = store_part[""Sales""]\n# X_train_part = X_train_part.fillna(X_train_part.mean())\n# estimator = RandomForestRegressor(random_state=42, criterion = \",No,5,76.0 "# : 0.14930 - pub, 0.13491 - priv",No,5,76.0 #df_store[pd.isnull(df_store.Promo2SinceWeek)]\n#df_store[pd.isnull(df_store.Promo2SinceWeek)& (df_store.Promo2==0)],No,5,76.0 "#!kaggle competitions submit -c rossmann-store-sales -f rossmann_submission.csv -m ""rossman with extra features""",No,5,76.0 #train_sales = np.log(train_sales),No,5,76.0 #preds = np.exp(predictions),No,5,76.0 "\n#df_train_store[['StateHoliday', 'StoreType', 'Assortment']] = df_train_store[['StateHoliday', 'StoreType', 'Assortment']].apply(lambda x: x.cat.codes)",No,5,76.0 "#This is the best combination i got from what i propose to try out with a (mse) score of 0.855 which is quite good\n#grid.best_params_,grid.best_score_\n#MY BEST PARAMS ARE 
:n_estimators=128,max_depth=20,min_samples_split=10",No,5,76.0 # now using Xgb ,No,5,76.0 "#X_train, X_valid, y_train, y_valid = train_test_split(x_train, y, train_size=0.8, test_size=0.2,\n # random_state=0)",No,5,76.0 # store_rows[store_rows['Sales']==0],No,5,76.0 # store.isna.sum(),No,5,76.0 # Decision tress - label encoding should be used.\n# regression - one hot encoding must be used.,No,5,76.0 # submitting the train on test data set,No,5,76.0 "\n# parameters={'max_depth':list(range(5,20))}\n# base_model=DTR()\n# cv_model=GridSearchCV(base_model,param_grid=parameters,cv=5,return_train_score=True).fit(X_train,y_train)\n",No,5,76.0 # cv_model.best_params_,No,5,76.0 "# cv_results_1=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)\n# cv_results=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)\n# cv_results_1.set_index('param_max_depth')['mean_test_score'].plot.line()\n# cv_results_1.set_index('param_max_depth')['mean_train_score'].plot.line()\n# plt.legend(['test','train'])",No,5,76.0 #!pip install pydotplus,No,5,76.0 "# def draw_tree(model, columns):\n# import pydotplus\n# from sklearn.externals.six import StringIO\n# from IPython.display import Image\n# import os\n# from sklearn import tree\n \n# graphviz_path = \",No,5,76.0 "# draw_tree(model_dtr,data_merged.columns.drop(['Sales','Date']))",No,5,76.0 #df['Date'].dt.strftime('%a'),No,5,76.0 "# from sklearn.model_selection import GridSearchCV\n\n# parameters={'max_depth':list(range(5,20))} # parmeters{'max_depth':list(range(5,20),'min_sample_split':[5,10,20])}\n# base_model=DecisionTreeRegressor()\n# cv_model=GridSearchCV(base_model,param_grid=parameters,cv=5,return_train_score=True).fit(train_x,train_y)",No,5,76.0 "# df_cv_results=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)[['param_max_depth','mean_test_score','mean_train_score']]\n# plt.figure(figsize=(10,5))\n# df_cv_results.set_index('param_max_depth')['mean_test_score'].plot.line()\n# df_cv_results.set_index('param_max_depth')['mean_train_score'].plot.line()\n# print(df_cv_results)\n",No,5,76.0 "# learn.fit_one_cycle(5, 5e-4, wd=0.1)\n# learn.recorder.plot_losses()",No,5,76.0 # #colab\n# # google-drive-ocamlfuse\n# # https://github.com/astrada/google-drive-ocamlfuse\n# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools\n# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n# !apt-get update -qq 2>&1 > /dev/null\n# !apt-get -y install -qq google-drive-ocamlfuse fuse\n\n# # ColabAuth token\n# from google.colab import auth\n# auth.authenticate_user()\n\n# # Drive FUSE librarycredential\n# from oauth2client.client import GoogleCredentials\n# creds = GoogleCredentials.get_application_default()\n# import getpass\n# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n# vcode = getpass.getpass()\n# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n\n# !mkdir -p drive\n# !google-drive-ocamlfuse -o nonempty drive,No,5,76.0 # !pip install kaggle\n# !echo \,No,5,76.0 # !ls\n# !unzip store.csv.zip\n# !unzip train.csv.zip\n# !unzip test.csv.zip,No,5,76.0 "# !pip install kaggle\n# !kaggle competitions submit -c rossmann-store-sales -f submission.csv -m ""submision""",No,5,76.0 # from google.colab import files\n# files.download('out.csv') \n# files.download('weights_rossmann.best.hdf5') \n,No,5,76.0 # !rsync -avz 
--progress ./model/model_both_a_13.pkl ../drive/Job/,No,5,76.0 #joined.to_pickle(PATH/'joined')\n#joined_test.to_pickle(PATH/'joined_test'),No,5,76.0 #df.to_pickle(PATH/'df'),No,5,76.0 #joined = pd.read_pickle(PATH/'joined')\n#joined_test = pd.read_pickle(PATH/f'joined_test'),No,5,76.0 #joined.to_pickle(path/'train_clean')\n#joined_test.to_pickle(path/'test_clean'),No,5,76.0 "result = pd.concat([test_dataset.ForecastId,output_confirmed_cases_df, output_fatalities_df], axis=1)\n# result.index.names = ['indexes']\n# result.columns.name = result.index.name\n# result.index.name = None\nresult\n",No,4,76.0 "# #Normalizing\n\n# no = 1\n\n# X[""PRI_jet_all_pt""]=((X[""PRI_jet_all_pt""]-X[""PRI_jet_all_pt""].min())/(X[""PRI_jet_all_pt""].max()-X[""PRI_jet_all_pt""].min()))*no\n# X_test[""PRI_jet_all_pt""]=((X_test[""PRI_jet_all_pt""]-X_test[""PRI_jet_all_pt""].min())/(X_test[""PRI_jet_all_pt""].max()-X_test[""DER_mass_MMC""].min()))*no\n\n# X[""PRI_jet_subleading_pt""]=((X[""PRI_jet_subleading_pt""]-X[""PRI_jet_subleading_pt""].min())/(X[""PRI_jet_subleading_pt""].max()-X[""PRI_jet_subleading_pt""].min()))*no\n# X_test[""PRI_jet_subleading_pt""]=((X_test[""PRI_jet_subleading_pt""]-X_test[""PRI_jet_subleading_pt""].min())/(X_test[""PRI_jet_subleading_pt""].max()-X_test[""PRI_jet_subleading_pt""].min()))*no\n\n# X[""PRI_jet_leading_pt""]=((X[""PRI_jet_leading_pt""]-X[""PRI_jet_leading_pt""].min())/(X[""PRI_jet_leading_pt""].max()-X[""PRI_jet_leading_pt""].min()))*no\n# X_test[""PRI_jet_leading_pt""]=((X_test[""PRI_jet_leading_pt""]-X_test[""PRI_jet_leading_pt""].min())/(X_test[""PRI_jet_leading_pt""].max()-X_test[""PRI_jet_leading_pt""].min()))*no\n\n# X[""PRI_met_sumet""]=((X[""PRI_met_sumet""]-X[""PRI_met_sumet""].min())/(X[""PRI_met_sumet""].max()-X[""PRI_met_sumet""].min()))*no\n# X_test[""PRI_met_sumet""]=((X_test[""PRI_met_sumet""]-X_test[""PRI_met_sumet""].min())/(X_test[""PRI_met_sumet""].max()-X_test[""PRI_met_sumet""].min()))*no\n\n# X[""DER_sum_pt""]=((X[""DER_sum_pt""]-X[""DER_sum_pt""].min())/(X[""DER_sum_pt""].max()-X[""DER_sum_pt""].min()))*no\n# X_test[""DER_sum_pt""]=((X_test[""DER_sum_pt""]-X_test[""DER_sum_pt""].min())/(X_test[""DER_sum_pt""].max()-X_test[""DER_sum_pt""].min()))*no\n\n# X[""DER_mass_jet_jet""]=((X[""DER_mass_jet_jet""]-X[""DER_mass_jet_jet""].min())/(X[""DER_mass_jet_jet""].max()-X[""DER_mass_jet_jet""].min()))*no\n# X_test[""DER_mass_jet_jet""]=((X_test[""DER_mass_jet_jet""]-X_test[""DER_mass_jet_jet""].min())/(X_test[""DER_mass_jet_jet""].max()-X_test[""DER_mass_jet_jet""].min()))*no\n\n# X[""DER_pt_h""]=((X[""DER_pt_h""]-X[""DER_pt_h""].min())/(X[""DER_pt_h""].max()-X[""DER_pt_h""].min()))*no\n# X_test[""DER_pt_h""]=((X_test[""DER_pt_h""]-X_test[""DER_pt_h""].min())/(X_test[""DER_pt_h""].max()-X_test[""DER_pt_h""].min()))*no\n\n# X[""DER_mass_vis""]=((X[""DER_mass_vis""]-X[""DER_mass_vis""].min())/(X[""DER_mass_vis""].max()-X[""DER_mass_vis""].min()))*no\n# X_test[""DER_mass_vis""]=((X_test[""DER_mass_vis""]-X_test[""DER_mass_vis""].min())/(X_test[""DER_mass_vis""].max()-X_test[""DER_mass_vis""].min()))*no\n\n# X[""DER_mass_transverse_met_lep""]=((X[""DER_mass_transverse_met_lep""]-X[""DER_mass_transverse_met_lep""].min())/(X[""DER_mass_transverse_met_lep""].max()-X[""DER_mass_transverse_met_lep""].min()))*no\n# X_test[""DER_mass_transverse_met_lep""]=((X_test[""DER_mass_transverse_met_lep""]-X_test[""DER_mass_transverse_met_lep""].min())/(X_test[""DER_mass_transverse_met_lep""].max()-X_test[""DER_mass_transverse_met_lep""].min()))*no\n\n# 
X[""DER_mass_MMC""]=((X[""DER_mass_MMC""]-X[""DER_mass_MMC""].min())/(X[""DER_mass_MMC""].max()-X[""DER_mass_MMC""].min()))*no\n# X_test[""DER_mass_MMC""]=((X_test[""DER_mass_MMC""]-X_test[""DER_mass_MMC""].min())/(X_test[""DER_mass_MMC""].max()-X_test[""DER_mass_MMC""].min()))*no\n\n\n# X.head()",No,5,76.0 # # normalize the data attributes\n# X = X.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n\n# X_test = X_test.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n\n\n# X.head(),No,5,76.0 "#X = X.replace(-999.000,np.nan)\n#X.head()",No,5,76.0 "#X_test = X_test.replace(-999.000,np.nan)",No,5,76.0 #X_test.head(),No,5,76.0 "#X = X.replace(-999.000,0)\n#X_test = X_test.replace(-999.000,0)\n#X.head()",No,5,76.0 "#X.fillna(X.median(), inplace=True)\n#X_test.fillna(X_test.median(), inplace=True)\n\n#X.head()",No,5,76.0 #X.tail(1000),No,5,76.0 "# train_set = train_set.drop(['Soil_Type'+str(idx) for idx in range(1, 41)], axis=1)\n# train_set = train_set.drop(['Wilderness_Area'+str(idx) for idx in range(1, 5)], axis=1)",No,5,76.0 "#Final submission score is 0.67,which needs to be improved.",Yes,5,76.0 "#Soil_Type7,Soil_Type15 has 0 standard deviation\n#train = train.drop([""Soil_Type7"",""Soil_Type15""],axis = 1)\n#test = test.drop([""Soil_Type7"",""Soil_Type15""],axis = 1)",No,5,76.0 "#of the 3 algorithms applied to the dataset,ensemble model works better with a score of 0.84\n",No,5,76.0 #Getting feature importance after running the data through a ensemble model classifier,No,5,76.0 "#Both the models have same important features,also these important features completely ignores the soil type feature.\n#This should be included,except Soil_Type7,Soil_Type15 has lower standard deviation.\n\n",No,5,76.0 #Naive Bayes Model,No,5,76.0 "#SGDClassifier,very low performance",No,5,76.0 #df_test1.head(),No,5,76.0 #y_pred = classifier.predict(df_test1),No,5,76.0 "#solution = pd.DataFrame({'Id':df_Test1.Id, 'Cover_Type':y_pred}, columns = ['Id','Cover_Type'])\n#solution.to_csv('SVMcover_sol.csv', index=False)",Yes,5,76.0 "# gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',\n# learning_rate = 0.01,\n# n_estimators = 100,\n# max_depth = 30,\n# random_state=10)\n\n# gradientBoostingModel.fit(X_train,y_train)",No,5,76.0 "# SGDClassifier = SGDClassifier(loss = 'hinge', \n# penalty = 'l1',\n# learning_rate = 'optimal',\n# random_state = 10, \n# max_iter=100)\n\n# SGDClassifier.fit(X_train,y_train)",No,5,76.0 "# SVClassifier = SVC(kernel= 'linear',\n# degree=3,\n# max_iter=10000,\n# C=2, \n# random_state = 55)\n\n# SVClassifier.fit(X_train,y_train)",No,5,76.0 # Forest Cover Prediction,Yes,5,76.0 #estimator.get_params().keys(),No,5,76.0 "##parameters_grid = {\n## 'model_fitting__n_estimators' : [70, 100, 130],\n## 'model_fitting__max_features' : [3, 4, 5, 6],\n##}\n##\n##grid_cv = grid_search.GridSearchCV(estimator, parameters_grid, scoring = 'neg_mean_absolute_error', cv = 3)\n##grid_cv.fit(train_data, train_labels)\n##\n##print(-grid_cv.best_score_)\n##print(grid_cv.best_params_)",No,5,76.0 #union_data['hour_type']=0\n#union_data['hour_type'][(union_data['hour']<=1)]='1'\n#union_data['hour_type'][(union_data['hour']>=2)& (union_data['hour']<=4)]='2'\n#union_data['hour_type'][(union_data['hour']>=4)& (union_data['hour']<=6)]='3'\n#union_data['hour_type'][(union_data['hour']>=6)& (union_data['hour']<=8)]='4'\n#union_data['hour_type'][(union_data['hour']>=9)& (union_data['hour']<=15)]='5'\n#union_data['hour_type'][(union_data['hour']>=16)& 
(union_data['hour']<=18)]='6'\n#union_data['hour_type'][(union_data['hour']>=19)& (union_data['hour']<=20)]='7'\n#union_data['hour_type'][(union_data['hour']>=21)]='8'\n\n,No,5,76.0 "# It is undeniable that working days have a distinct behavior on their own. So we assume that the separation non working days vs working days is a valid one. However,\n# Looking at the non working days the conclusion is less obvious. We will assume that for seasons 3 and 4 we have a distinct pattern for non working days and leave a \n# possible refinement to a later version. Lets see if we can find a separation for weekends and public holidays for seasons 1 and 2.\n\n# In what follows we will repeat the steps above but now, instead of X_train we will have X_holiday_1, instead of X_workingday_1 we will have X_1_public_holidays \n# and X_1_weekends instead X_holiday_1. And the same for season 2. We try to figure out if this separation will increase the ration mean counts /std counts at least\n# for one of these groups",No,5,76.0 "# Conclusion: We can observe that, for season 1, by separating the public holidays from the weekends we observe a better ratio for the public holidays, while for\n# weekends this ratio is higher for only 50% of the hours, while for season 2, the conclusion is the opposite.",No,5,76.0 # from sklearn.tree import DecisionTreeClassifier\n# from sklearn.tree import DecisionTreeRegressor\n\n# # model = DecisionTreeClassifier()\n\n# # random_state .\n# #model = DecisionTreeClassifier(random_state=37)\n# model = DecisionTreeRegressor(random_state=37)\n# model,No,5,76.0 "'''\nfor idx_train, idx_test in ms.split(df_train_data):\n csv = linear_model.Ridge().fit(df_train_data.iloc[idx_train], \\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "'''\nfor idx_train, idx_test in ms.split(df_train_data):\n csv = svm.SVR(kernel='rbf', C=10, gamma=0.001).fit(df_train_data.iloc[idx_train],\\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "#df_train_data_notime = df_train_data.drop(['hour', 'dayofweek', 'month'], axis=1)",No,5,76.0 "# for idx_train,idx_test in ms.split(df_train_data):\n# csv = RandomForestRegressor(n_estimators=500).fit(df_train_data.iloc[idx_train],\\\n# df_train_target.iloc[idx_train])\n# print('train score: {0: .3f}, test score: {1: .3f}'.format(\n# csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n# csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n# ))",No,5,76.0 "'''\nfor idx_train,idx_test in ms.split(df_train_data):\n csv = RandomForestRegressor(n_estimators=100).fit(df_train_data_notime.iloc[idx_train],\\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data_notime.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data_notime.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "# estimator2 = RandomForestRegressor(n_estimators=200, max_features=0.6, max_depth=15)\n# plot_learning_curve(estimator2, title, \n# df_train_data, df_train_target, ylim=(0.7, 1.01), cv=cv, n_jobs=4)\n# 
plt.show()",No,5,76.0 #df_sample['count'] = df_sample['count'].apply(lambda x: int(x + 0.5)),No,5,76.0 ####,No,5,76.0 "# sns.factorplot(x=""month"",y=""count"",data=train_set,kind=\",No,5,76.0 # train_set['high_time'] = np.zeros_like(train_set['time'])\n# train_set['high_time'].loc[(((train_set['time'] > 6) & (train_set['time'] < 15)) | (train_set['time'] == 20))] = 1\n# train_set['high_time'].loc[((train_set['time'] == 8) | (train_set['time'] == 16) | (train_set['time'] == 19))] = 2\n# train_set['high_time'].loc[((train_set['time'] == 17) | (train_set['time'] == 18))] = 3,No,5,76.0 "# def RMSLE(y_hat, data):\n# y_true = data.get_label()\n# y_hat = np.round(y_hat)\n# y_hat[y_hat<0]=0\n# return 'rmlse', np.sqrt(mean_squared_log_error(y_true, y_hat)), True",No,5,76.0 "# d_train = lgb.Dataset(X, label=y)\n# params = {'objective': 'regression', 'metric': 'rmsle', 'random_state': 501, 'verbose': 0, 'reg_alpha ': 0.1, 'reg_lambda': 0.1}",No,5,76.0 "# lgb_cv = lgb.cv(\n# params, \n# d_train,\n# metrics = 'rmsle',\n# feval= RMSLE,\n# nfold=5,\n# verbose_eval = 5)",No,5,76.0 "# lgb_model = lgb.train(\n# params, \n# d_train,\n# feval= RMSLE,\n# verbose_eval = 5)",No,5,76.0 "# d_importance = pd.DataFrame(columns=['features'], data=X.columns)\n# d_importance['gain_importance'] = lgb_model.feature_importance(importance_type='gain')\n# d_importance['split_importance'] = lgb_model.feature_importance(importance_type='split')\n# d_importance.sort_values(by='gain_importance',ascending=False).head(25)",No,5,76.0 "# xgb_model = XGBRegressor(colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7, random_state=42)\n# xgb_model.fit(X, y)",No,5,76.0 "# def rmsle(y_true, y_hat):\n# y_hat = np.round(y_hat)\n# y_hat[y_hat<0]=0\n# return np.sqrt(mean_squared_log_error(y_true, y_hat))\n\n# rmsle_score = make_scorer(rmsle, greater_is_better=False)",No,5,76.0 "# scores = cross_val_score(xgb_model, X, y, cv=5, scoring=rmsle_score)\n# print(""scores "", np.abs(scores))",No,5,76.0 "# d_importance = pd.DataFrame(columns=['features'], data=X.columns)\n# d_importance['importance'] = xgb_model.feature_importances_\n# d_importance.sort_values(by='importance',ascending=False).head(20)",No,5,76.0 "#grid_params = {'max_depth' : [12,14,16]}\n#grid_xgb = GridSearchCV(xgb_clf, grid_params, cv= 5)\n#grid_xgb.fit(x_train, y_train)\n#print(grid_xgb.best_score_)\n#grid_xgb.cv_results_\n#grid_xgb.score(x_test, y_test)",No,5,76.0 #!rm submission.csv,No,5,76.0 "# features_cyc = ['hour', 'weekday']\n# for feature in features_cyc:\n# train_data[feature+'_sin'] = np.sin((2*np.pi*train_data[feature])/max(train_data[feature]))\n# train_data[feature+'_cos'] = np.cos((2*np.pi*train_data[feature])/max(train_data[feature]))\n# test_data[feature+'_sin'] = np.sin((2*np.pi*test_data[feature])/max(test_data[feature]))\n# test_data[feature+'_cos'] = np.cos((2*np.pi*test_data[feature])/max(test_data[feature]))\n# train_data = train_data.drop(features_cyc, axis=1)\n# test_data = test_data.drop(features_cyc, axis=1)",No,5,76.0 "# TIP) .\n# time = train['datetime'].str.slice(11,13).astype(int)\n# time.head()\n",No,5,76.0 "# validate dataset 7:3\n#from sklearn.model_selection import train_test_split\n#train_x, validate_x, train_y, validate_y = train_test_split(train, y, test_size = 0.3,\n #random_state = 777)",No,5,76.0 "# . , .\n#from sklearn.ensemble import RandomForestRegressor\n# , . n_estimator .\n# cpu 1 . 
n_jobs=4 -1 CPU \n# random_state set.seed() \n#rf = RandomForestRegressor(n_estimators=100, n_jobs=-1,random_state=999)\n#rf.fit(train, y)",No,5,76.0 #result = rf.predict(test)\n ,No,5,76.0 "#from lightgbm import LGBMRegressor\n# boosting hyper parameter (, )\n#lgbm = LGBMRegressor()\n#lgbm.fit(train, y)\n",No,5,76.0 #preds = lgbm.predict(test),No,5,76.0 "# 0 .\n# , .\n# train.loc[train[""windspeed""] == 0, ""windspeed""] = train[""windspeed""].mean()\n# test.loc[train[""windspeed""] == 0, ""windspeed""] = train[""windspeed""].mean()",No,5,76.0 # realizando as tranformaes nos dados,No,5,76.0 "'''\nfrom sklearn.ensemble import RandomForestRegressor\nrf=RandomForestRegressor(n_estimators=100,random_state=0)\nrf.fit(X,Y)\nimp_list=rf.feature_importances_\nfeats = {} # a dict to hold feature_name: feature_importance\nfor feature, importance in zip(final_df.columns, rf.feature_importances_):\n feats[feature] = importance #add the name/value pair\n''' ",No,5,76.0 "#month => train count test count . , \n# . train \n# test . 20~31 . ",No,5,76.0 160/3251,No,3,76.0 "# scores.mean(), scores",No,5,76.0 "# from sklearn.cross_validation import cross_val_predict\n# y_pred = cross_val_predict(LogisticRegression(), X, Y, cv=10, n_jobs=-1, verbose=1)\n# log_loss(Y, y_pred)",No,5,76.0 "# from sklearn.model_selection import StratifiedKFold\n# kf = StratifiedKFold(n_splits=10, random_state=0)\n# pred = np.zeros((Y.shape[0], Y.nunique()))\n# for train_index, test_index in kf.split(X, Y):\n# X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n# y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]\n# lr = LogisticRegression(solver=\",No,5,76.0 "# from sklearn.ensemble import RandomForestClassifier\n# scores = cross_val_score(RandomForestClassifier(n_estimators=100), X, Y, scoring='neg_log_loss',cv=10, verbose=1)",No,5,76.0 "# scoresmean(), scores.",No,5,76.0 "#del train,valid,testdata\n#gc.collect()",No,5,76.0 "#clf = LogisticRegression(C=0.02)\n#clf.fit(Xtrain, y)\n#clf.predict_proba(Xtrain[70000:], y[70000:])\n#log_loss(yte, pred[itest, :])",No,5,76.0 "#pred = clf.predict_proba(Xtrain[70000:])\n#log_loss(y[70000:], pred)",No,5,76.0 "#pred = pd.DataFrame(clf.predict_proba(Xtest), index=ga_test.index, columns=target_encoder.classes_)\n#pred.head()\n#pred.to_csv('logreg_subm.csv',index=True)",No,5,76.0 ##train_users[train_users['id'] == 'bibf93h56j']\n##train_users['date_first_booking'].isnull(),No,5,76.0 ##train_users.head()\n##train_users[train_users['first_browser_grouped'] == 'Mobile']\n\n#### language doesn't appear that helpful.. anyway we can adjust it some?\n\n#train_users.head(),No,5,76.0 \n#test = [0]\n#train_users[\,No,5,76.0 "#fig, (axis1, axis2) = plt.subplots(2,1,figsize=(15,10))\n#sns.countplot(x=\",No,5,76.0 "\n\n#fig, (axis1) = plt.subplots(1,1,figsize=(15,5))\n#sns.countplot(x=\",No,5,76.0 ##### is it worthwhile to group up some of these X vars w/ a lot of subclasses? 
\n,No,5,76.0 "# from sklearn.model_selection import cross_val_score\n# from sklearn.ensemble import GradientBoostingClassifier\n\n# np.random.seed(42)\n# samples = np.random.choice(piv_train, 10000)\n# X_train = vals[samples]\n# y_train = le.fit_transform(labels)[samples]\n# model = GradientBoostingClassifier()\n# cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)",No,5,76.0 "# Too Much Unknown Data In Columns : Gender & First Browser , will need to fix that later\n# Now Let's Focus on the Dates Data",No,5,76.0 #Building the Classfication Model,No,5,76.0 # Using Submession System To Evaluate The Model,No,5,76.0 "# #trust your CV!\n# best_parameters,score = max(clf.scorer_, key=lambda x: x[1])\n# print(\",No,5,76.0 "# fig, ax = plt.subplots(5, 2, figsize=(10, 50))\n# for i in range(5):\n# ax[i, 0].imshow(X_train_zero[i])\n# ax[i, 1].imshow(Y_train_zero[i])\n# print(np.unique(Y_train_zero[i]))",No,5,76.0 "# fig, ax = plt.subplots(5, 2, figsize=(10, 50))\n# for i in range():\n# ax[i, 0].imshow(X_train_one[i], 'gray')\n# Y_train_one = np.array(Y_train_one, dtype='bool')\n# # Y_train_one[i][Y_train_one]=1\n# ax[i, 1].imshow(Y_train_one[i], 'gray')\n# print(np.unique(Y_train_one[i]))",No,5,76.0 "# for img, mask in zip(X_train_zero, Y_train_zero):\n# X_train.append(img)\n# Y_train.append(mask)",No,5,76.0 "# X_train_all = np.concatenate((X_train, X_train_zero[:1000]), axis=0)\n# Y_train_all = np.concatenate((Y_train, Y_train_zero[:1000]), axis=0)",No,5,76.0 "# model = Unet('densenet121',encorder_weights='imagenet',freeze_encorder=True)",No,5,76.0 "# N = 1\n\n# base_model = Unet(backbone_name='resnet34', encoder_weights='imagenet')\n\n# inp = Input(shape=(None, None, N))\n# l1 = Conv2D(3, (1, 1))(inp) # map N channels data to 3 channels\n# out = base_model(l1)\n\n# model = Model(inp, out, name=base_model.name)\n# model.compile(optimizer=Adam(lr = 1e-5), loss=dice_coef_loss, metrics=[dice_coef])",No,5,76.0 #embedding_dim = len(tokenizer.word_index)+1,No,5,76.0 "#my_submissionxgb = pd.DataFrame({'id': test.id, 'country':resultsxgb})\n#my_submissionxgb.to_csv('submissionxgb.csv', index=False)",No,5,76.0 "# vocab = vectorizer.get_feature_names()\n# dist = np.sum(train_data_features, axis=0)\n# for tag, count in zip(vocab, dist):\n# print (count, tag)",No,5,76.0 "# test = pd.read_csv(""../input/test.tsv"", sep=\",No,5,76.0 "#train_data = train.drop(['Phrase'], axis=1)\n#test_data = test.drop(['Phrase'], axis=1)",No,5,76.0 "#np.savetxt(""outpu.csv"", y1, delimiter="","")\n",No,5,76.0 "#train[""day of week""].value_counts()",No,5,76.0 "#grid_search.fit(X_train, y_train)",No,5,76.0 #!pip install xgboost,No,5,76.0 #from sklearn.model_selection import GridSearchCV\n#from sklearn.model_selection import ShuffleSplit,No,5,76.0 "'''\nxgb1 = xg_reg\nparameters = {'nthread':[3], #when use hyperthread, xgboost may become slower\n 'objective':['reg:linear'],\n 'learning_rate': [.03, 0.05, .07], #so called `eta` value\n 'max_depth': [5, 6, 7],\n 'min_child_weight': [4],\n 'silent': [1],\n 'subsample': [0.7],\n 'colsample_bytree': [0.7],\n 'n_estimators': [500]}\n\nxgb_grid = GridSearchCV(xgb1,\n parameters,\n cv = 2,\n n_jobs = 3,\n verbose=True)\n\nxgb_grid.fit(X_train,\n y_train)\n\nprint(xgb_grid.best_score_)\nprint(xgb_grid.best_params_)\n'''",No,5,76.0 "'''xg_reg = xgb.XGBRegressor(colsample_bytree= 0.7, learning_rate= 0.07, max_depth= 5, min_child_weight= 4, n_estimators= 300, nthread= 4, objective= 'reg:linear', silent= 1, subsample=0.7)'''",No,5,76.0 "'''xg_reg.fit(X_train,y_train)\n\nresult = 
xg_reg.predict(X_test)'''",No,5,76.0 "# sub_prev_year_median.to_csv('submission_prev_year.csv',index=False)\n# FileLink('submission_prev_year.csv')",No,5,76.0 "# sub_prev_year_median.to_csv('sub_median_60.csv',index=False)\n# FileLink('sub_median_60.csv')",No,5,76.0 "#world_population = pd.read_csv(""/kaggle/input/population-by-country-2020/population_by_country_2020.csv"")\n#display(world_population.head()) #for next round",No,5,76.0 "# ## Add 1-year `Weekly_Sales` lag ##\n\n# X_all = pd.concat([X_train, X_test])\n# X_all['Date2'] = pd.to_datetime(X_all['Date'], utc = True)\n# X_all['52_Week_Lag'] = X_all['Date2'] - np.timedelta64(52,'W')\n# X_all_temp = X_all[['Weekly_Sales', 'Date2', 'Store', 'Dept']]\n\n# X_all = X_all.merge(X_all_temp,\n# left_on=['Store', 'Dept', '52_Week_Lag'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner',\n# suffixes=('', '_y'))\n# X_all.rename(columns={'Weekly_Sales_y': 'Weekly_Sales_Lag_52_Weeks'}, inplace=True)\n# X_all = X_all[[col for col in X_all.columns if not col.endswith('_y')]]\n\n# drop_cols = ['Date2_y', '1_Year_Lag']\n# X_all.drop(['52_Week_Lag'], axis=1, inplace=True)\n\n# X_all.isna().sum()",No,5,76.0 "# X_train['Date2'] = pd.to_datetime(X_train['Date'], utc = True)\n# X_test['Date2'] = pd.to_datetime(X_test['Date'], utc = True)\n\n# X_train['Weekly_Sales_Lag_52_Weeks'] = X_train.merge(X_all, \n# left_on=['Store', 'Dept', 'Date2'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner')['Weekly_Sales_Lag_52_Weeks']\n# X_test['Weekly_Sales_Lag_52_Weeks'] = X_test.merge(X_all, \n# left_on=['Store', 'Dept', 'Date2'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner')['Weekly_Sales_Lag_52_Weeks']\n\n# X_test.head()",No,5,76.0 "# sns.set(style=""ticks"", color_codes=True)\n\n# for col in X_train.columns.drop(\",No,5,76.0 ## takes too long ##,No,5,76.0 # hist = pd.DataFrame(history.history)\n# hist.plot(),No,5,76.0 "## create 1-year lag value of store sales (can't use 1-week as test set doesn't have any `Weekly_Sales` ##\n## X_train ends at 2012-10-26 and X_test ends at 2013-07-26, so no NaN values for `Weekly_Sales_Lag` in X_test ##",No,5,76.0 "# X_all = pd.concat([X_train, X_test])\n# X_all.tail()",No,5,76.0 "### GridSearchCV test #X_test, _ = loadData(test, test = True) #prediction = grid_search.predict(X_test)",No,5,76.0 "### RF validation ''' X_train, X_test, y_train, y_test = train_test_split(X, new_y, test_size = 0.33, random_state = 42) rf = RandomForestRegressor() rf.fit(X_train, y_train) prediction = rf.predict(X_test) mean_squared_error(y_test, prediction) '''",No,5,76.0 "# standizer = StandardScaler() # data[np.array(data.columns[:])] = standizer.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = standizer.transform(test[np.array(test.columns[:])]) ",No,5,76.0 "# rbs = RobustScaler() # data[np.array(data.columns[:])] = rbs.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = rbs.fit_transform(test[np.array(test.columns[:])])",No,5,76.0 "# pca = PCA() # data[np.array(data.columns[:])] = pca.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = pca.fit_transform(test[np.array(test.columns[:])])",No,5,76.0 "# krc = KernelCenterer() # data[np.array(data.columns[:])] = krc.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = krc.transform(test[np.array(test.columns[:])])",No,5,76.0 "# kf = KFold(n_splits=5,shuffle=True) # random_forest_acc = 0 # adaboost_acc = 0 # extraRandom_acc = 0 # svm_acc = 0 # 
gradientBoosting_acc = 0 # for train_index, test_index in kf.split(data): # X_train = data.filter(items=train_index, axis=0) # X_test = data.filter(items=test_index, axis=0) # y_train = input_label[train_index] # y_test = input_label[test_index] # # for randomForest # random_forest_clf = RandomForestClassifier(n_estimators=50) # random_forest_clf.fit(X_train, y_train) # rand_given_labels = random_forest_clf.predict(X_test) # random_forest_acc += accuracy_score(y_test, rand_given_labels) # # for AdaBoost # adaboost_clf = AdaBoostClassifier(n_estimators = 100,learning_rate=0.5) # adaboost_clf.fit(X_train, y_train) # ada_given_labels = adaboost_clf.predict(X_test) # adaboost_acc += accuracy_score(y_test, ada_given_labels) # # for extra random forest # extraRandom= ExtraTreesClassifier(n_estimators=100, max_depth=None,min_samples_split=2) # extraRandom.fit(X_train, y_train) # xrand_given_labels = extraRandom.predict(X_test) # extraRandom_acc += accuracy_score(y_test, xrand_given_labels) # # for gradient boosting # gradientBoosting_clf = GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=1) # gradientBoosting_clf.fit(X_train, y_train) # gradientBoosting_given_labels = gradientBoosting_clf.predict(X_test) # gradientBoosting_acc += accuracy_score(y_test, gradientBoosting_given_labels) # # for svm # svm_clf = svm.SVC(C= 0.1 , kernel='linear') # svm_clf.fit(X_train, y_train) # svm_given_labels = svm_clf.predict(X_test) # svm_acc += accuracy_score(y_test, svm_given_labels) ",No,5,76.0 "#feature selection # from sklearn.feature_selection import SelectKBest # from sklearn.feature_selection import chi2 # select = SelectKBest(chi2, k=6) # train = select.fit_transform(train, train_labels) # test = select.transform(test) # from sklearn.feature_selection import VarianceThreshold # sel = VarianceThreshold(threshold=0.1) # selFeature=sel.fit_transform(train)",No,5,76.0 "# #linearSVM (1) # linearSVM_clf = svm.SVC(kernel='linear', C=1).fit(train_normalized,train_labels) # #acc1=cross_val_score(clf, train_normalized, train_labels, cv=20, scoring='accuracy') # trainpred=linearSVM_clf.predict(train_normalized) # testpred=linearSVM_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # print(acc1) # # print(np.mean(acc1)) # #results.append(clf.predict(test_normalized)) ",No,5,76.0 "# #rbfSVM (2) # rbfSVM_clf = svm.SVC(kernel='rbf', C=1).fit(train_normalized,train_labels) # #rbfSVM_acc=cross_val_score(rbfSVM_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # trainpred=rbfSVM_clf.predict(train_normalized) # testpred=rbfSVM_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # print(rbfSVM_acc) # # print(np.mean(rbfSVM_acc)) # #results.append(rbfSVM_clf.predict(test_normalized)) ",No,5,76.0 "# #logReg (5) # from sklearn.linear_model import LogisticRegression # logReg= LogisticRegression().fit(train_normalized,train_labels) # trainpred=logReg.predict(train_normalized) # testpred=logReg.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # logreg_acc=cross_val_score(logReg,train_normalized,train_labels,cv=10,scoring='accuracy') # # print(logreg_acc) # # print(np.mean(logreg_acc)) # #results.append(logReg.predict(test_normalized))",No,5,76.0 "# #NearestCentroid (6) # from sklearn.neighbors.nearest_centroid import NearestCentroid # NC_clf = NearestCentroid() # NC_clf.fit(train_normalized, train_labels) # trainpred=NC_clf.predict(train_normalized) # 
testpred=NC_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # NC_acc=cross_val_score(NC_clf,train_normalized,train_labels,cv=10,scoring='accuracy') # # print(logreg_acc) # # print(np.mean(logreg_acc)) # #results.append(logReg.predict(test_normalized))",No,5,76.0 "# #SGD (11) # from sklearn.linear_model import SGDClassifier # sgd_clf = SGDClassifier(loss=""hinge"", penalty=""l2"").fit(train_normalized,train_labels) # trainpred=clf.predict(train_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # results.append(sgd_clf.predict(test_normalized)) ",No,5,76.0 "# #LDA (13) # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA_clf = LinearDiscriminantAnalysis().fit(train_normalized,train_labels) # LDA_clf.fit(train_normalized,train_labels) # trainpred=LDA_clf.predict(train_normalized) # testpred=LDA_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # #LDA_acc = cross_val_score(LDA_clf, train_normalized, train_labels, cv=10, scoring='accuracy') # # print(LDA_acc) # # print(np.mean(LDA_acc)) # # resultLDA = LDA_clf.predict(test_normalized)",No,5,76.0 "# #GaussianNB (14) # from sklearn.naive_bayes import GaussianNB # GNB_clf = GaussianNB() # GNB_clf.fit(train_normalized,train_labels) # trainpred=GNB_clf.predict(train_normalized) # testpred=GNB_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # #GNB_acc = cross_val_score(GNB_clf, train_normalized, train_labels, cv=10, scoring='accuracy') # # print(GNB_acc) # # print(np.mean(GNB_acc))",No,5,76.0 "# svmodel = svm.SVC(C=1, kernel=""poly"") # svmodel.fit(main, labels)",No,5,76.0 "# sklearn.metrics.accuracy_score(labels, svmodel.predict(features))",No,5,76.0 "# sklearn.metrics.accuracy_score(tlabels, svmodel.predict(sfeatures))",No,5,76.0 "# cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': svmodel.predict(test_principalComponenta) } # submission = pd.DataFrame(cols) # submission.to_csv(""submission.csv"", index=False) # submission",No,5,76.0 "inputf = features # frst1 = IsolationForest(n_estimators=5) # frst1.fit(inputf, labels) # frst2 = IsolationForest(n_estimators=5) # frst2.fit(inputf, labels) # frst3 = IsolationForest(n_estimators=5) # frst3.fit(inputf, labels) # frst4 = IsolationForest(n_estimators=5) # frst4.fit(inputf, labels) # frst5 = IsolationForest(n_estimators=5) # frst5.fit(inputf, labels) # frst6 = IsolationForest(n_estimators=5) # frst6.fit(inputf, labels) # frst7 = IsolationForest(n_estimators=5) # frst7.fit(inputf, labels) # frst8 = IsolationForest(n_estimators=5) # frst8.fit(inputf, labels) # frst9 = IsolationForest(n_estimators=5) # frst9.fit(inputf, labels) # frst10 = IsolationForest(n_estimators=5) # frst10.fit(inputf, labels)",Yes,4,76.0 "testf = sfeatures # pred1 = frst1.predict(testf) # pred2 = frst2.predict(testf) # pred3 = frst3.predict(testf) # pred4 = frst4.predict(testf) # pred5 = frst5.predict(testf) # pred6 = frst6.predict(testf) # pred7 = frst7.predict(testf) # pred8 = frst8.predict(testf) # pred9 = frst9.predict(testf) # pred10 = frst10.predict(st_sfeatures)",Yes,4,76.0 "# res1 = [] # for i1,i2,i3,i4,i5,i6,i7,i8,i9,i10 in zip(pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10): # j = np.sum([i1,i2,i3,i4,i5,i6,i7,i8,i9,i10]) # if j >= 5: # res1.append(1) # else: # res1.append(0)",No,5,76.0 "# from keras.preprocessing import image # from os import walk # data=[] # input_file_names=[] # #####get the file names of the images to 
read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/train""): # input_file_names=filenames # for x in input_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/train/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((64,64))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # data.append(iarray)#####appending the image to the list ",No,5,76.0 # plt.imshow(data[5]),No,5,76.0 "# data=np.array(data) # ####generating labels for the data # labels=[] # for x in input_file_names: # if x.find(""cat"")>=0: # labels.append(0) # else: # labels.append(1) # ###checking if the labels are properly tagged or not,both the classes have equal images 12500 each # a=np.array(labels) # np.unique(a,return_counts=True) # ###reshaping the labels # labels=a.reshape(25000,1)",No,5,76.0 "# #####rescaling the data # data=data/255. ",No,5,76.0 "# model=Sequential() # model.add(Conv2D(64,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100),input_shape=(64,64,3) )) # model.add(Conv2D(64,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(128,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(128,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Flatten()) # model.add(Dense(10,activation=""relu"",kernel_initializer=glorot_normal(seed=100))) # model.add(Dense(1,activation=""sigmoid"",kernel_initializer=glorot_normal(seed=100))) ",No,5,76.0 "###train test split # from sklearn.model_selection import train_test_split # train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,76.0 # train_y.shape,No,5,76.0 "####compiling the model # o=optimizers.adam() # model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o) ",No,5,76.0 "####fitting the model # H=model.fit(train_x,train_y,epochs=16,validation_split=0.2) ",No,5,76.0 "# plt.plot(range(1,17),H.history[""acc""]) # plt.plot(range(1,17),H.history[""val_acc""]) ",No,5,76.0 "#####making predictions on the test data # preds=model.predict_classes(test_x)",No,5,76.0 # sum(preds==test_y)/len(test_y),No,5,76.0 "# test_data=[] # input_test_file_names=[] # #####get the file names of the images to read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/test""): # 
input_test_file_names=filenames # for x in input_test_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/test/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((64,64))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # test_data.append(iarray)#####appending the image to the list",No,5,76.0 "# test_data=np.array(test_data) # test_data=test_data/255.",No,5,76.0 "# test_preds=model.predict(test_data) # test_preds=test_preds.reshape(len(test_preds))",No,5,76.0 "####as per the submission file rule only numerical part from the file wwas needed ####like 3090.jpg should be saved in as 3090 # new_input_test_file_names=[] # for x in input_test_file_names: # k=int(x[0:x.find("".jpg"")]) # new_input_test_file_names.append(k)",No,5,76.0 "# df=pd.DataFrame({'id':new_input_test_file_names, # 'label':test_preds})",No,5,76.0 "# df.to_csv(""submission.csv"",index=False)",No,5,76.0 "# from keras.applications import resnet50 # from keras.preprocessing.image import ImageDataGenerator # r=resnet50.ResNet50(weights='imagenet',include_top=False,input_shape=(197,197,3))",No,5,76.0 "#########33getting data in in sahpe of (197,197,3) as min reqrmnt of resnet 50 # from keras.preprocessing import image # from os import walk # data=[] # input_file_names=[] # #####get the file names of the images to read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/train/""): # input_file_names=filenames # rand_imgs_indexes=random.sample(range(0, 24999), 14000) # new_input_file_names=[] # ######taking only 20000 random images # for k in rand_imgs_indexes: # new_input_file_names.append(input_file_names[k]) # for x in new_input_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/dogs-vs-cats-redux-kernels-edition/train/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((197,197))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # iarray=iarray/255. 
# data.append(iarray)#####appending the image to the list",No,5,76.0 "# data=np.array(data) # ####generating labels for the data # labels=[] # for x in new_input_file_names: # if x.find(""cat"")>=0: # labels.append(0) # else: # labels.append(1) # ###reshaping the labels # a=np.array(labels) # labels=a.reshape(14000,1)",No,5,76.0 "# #########defining the new model by defining my own last layer # new_model=r.output # new_model=Flatten()(new_model) # new_model=Dense(10)(new_model) # new_model=Activation(""relu"")(new_model) # new_model=Dense(1,activation=""sigmoid"")(new_model) # final_model=Model(input=r.input,output=new_model) ",No,5,76.0 "###freezin all layers except from last 3 layers # total_layers=len(final_model.layers) # print(total_layers) # for x in range(0,total_layers-4): # final_model.layers[x].trainable=False # final_model.layers",No,5,76.0 "##checking if the layers have been frozen or not # for x in range(0,total_layers): # print(final_model.layers[x]) # print(final_model.layers[x].trainable)",No,5,76.0 "###train test split # from sklearn.model_selection import train_test_split # train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,76.0 "####compiling the model # o=optimizers.adam() # final_model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o)",No,5,76.0 "# final_model.fit(train_x,train_y,epochs=2,validation_split=0.2)",No,5,76.0 "# predicted_test=final_model.predict(train_x) ",No,5,76.0 ##########################trying vgg19 model###################################,No,5,76.0 "# final_model.save_weights(""vgg_19.h5"")",No,5,76.0 #lrf= learn.lr_find(),No,5,76.0 #learn.sched.plot_lr(),No,5,76.0 #learn.sched.plot(),No,5,76.0 #learn.save('model1'),No,5,76.0 #learn.save('model2'),No,5,76.0 #learn.load('model2'),No,5,76.0 #learn.save('model3'),No,5,76.0 "#tmpk= log_preds #tmpk= log_preds[:,:,0] #tmpk=tmpk.reshape(tmpk.shape[1],tmpk.shape[0])",No,5,76.0 #tmpk= [np.mean(i) for i in tmpk],No,5,76.0 #tmpk= [ np.exp(i) for i in tmpk],No,5,76.0 "# model 2 mean 5.006981e-01 std 4.964828e-01 min 1.536870e-09 [0.025943222, 0.9912974683544303]",No,5,76.0 "#model 1 mean 5.014179e-01 std 4.955358e-01 min 2.701442e-08 [0.026336912, 0.991495253164557]",No,5,76.0 "# from sklearn.ensemble import RandomForestRegressor # rfr = RandomForestRegressor(n_estimators=300, criterion='mae', max_depth=12, n_jobs=-1, verbose=True) # rfr.fit(X_train.values, np.log(y_train.values) + 1) # y_hat = rfr.predict(X_test.values) # y_hat = np.exp(y_hat) - 1 # print(f'MAE: {mae(y_test, y_hat)}') # print(f'RMSPE: {rmspe(y_hat, y_test)}')",No,5,76.0 "# params = {'colsample_bytree': 0.7000000000000001, # 'eta': 0.625, # 'gamma': 0.8, # 'max_depth': 6, # 'eval_metric': 'rmse', # 'min_child_weight': 6.0, # 'n_estimators': 8.0, # 585 # 'silent': 1, # 'subsample': 0.9500000000000001} # watchlist = [(xtrain, 'train'), (xtest, 'eval')] # num_round = 10000 # xgb_regressor = xgb.train(params, xtrain, num_round, watchlist, feval=rmspe_xg, # verbose_eval=10, early_stopping_rounds=50)",No,5,76.0 "# fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 20)); # xgb.plot_importance(xgb_regressor, axes)",No,5,76.0 "# print(""Validating"") # train_probs = xgb_regressor.predict(xtest) # indices = train_probs < 0 # train_probs[indices] = 0 # error = rmspe(np.exp(train_probs) - 1, y_test.values) # print(\'error\', error) # xgb_regressor = xgb.train(params, xtest, 1000, feval=rmspe_xg, xgb_model=xgb_regressor)",No,5,76.0 # print(best_opts),No,5,76.0 "#PMR3508 - Tarefa1 
Adult #hash: PMR3508-2018-d59e43f3c1",No,5,76.0 # This is based on Fujisan's kernel on Invasive Species Monitoring,No,5,76.0 "# Grid Search is an ideal candidate for distributed machine learning # Pseudo code for hyperparameters Grid Search ''' from sklearn.grid_search import ParameterGrid param_grid = {'epochs': [5, 10, 15], 'steps_per_epoch' : [10, 20, 50]} grid = ParameterGrid(param_grid) # Accumulate history of all permutations (may be for viewing trend) and keep watching for lowest val_loss as final model for params in grid: print(params) '''",No,5,76.0 "# Normalizar las imgenes (1pt) #x_train = x_train.reshape([1712, 96*96])/255 #x_val = x_val.reshape([428, 96*96])/255 #x_train[0] #valores entre 0 y 1, usara una capa de batchnormalization en la red #Se realiz esto en iteraciones previas, el resultado fue peor, se decide no scalar a [0,1] ni utilizar batch normalization",No,5,76.0 "#labels_axis =np.array([['_x','_y']]) #labels_axis = np.repeat(labels_axis,26745,axis=0).flatten() #labels_axis.shape",No,5,76.0 "#labels= np.core.defchararray.add(labels_area, labels_axis) #labels.shape",No,5,76.0 "#ImageId = np.arange(1,1784) #ImageId =np.repeat(ImageId, 30) #ImageId.shape",No,5,76.0 "#RowId=np.int32(np.arange(1,53491)) #RowId.shape",No,5,76.0 "#sub = np.array([RowId,ImageId,labels,results]) #sub = np.swapaxes(sub,0,1) #sub.shape",No,5,76.0 "#sub_df = pd.DataFrame(data=sub,columns=['RowId','ImageId','FeatureName','Location']) #sub_df.ImageId = pd.to_numeric(sub_df.ImageId) ",No,5,76.0 #sub_df[(sub_df['FeatureName'] == 'left_eye_center_x') & (sub_df['ImageId'] == 1)],No,5,76.0 "# model = Sequential() # model.add(Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model.add(LeakyReLU(alpha=.001)) # model.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model.add(Dropout(0.7)) # model.add(Flatten()) # model.add(Dense(256, activation = 'linear')) # model.add(LeakyReLU(alpha=.001)) # model.add(Dropout(0.7)) # model.add(Dense(128, activation = 'linear')) # model.add(LeakyReLU(alpha=.001)) # model.add(Dropout(0.7)) # model.add(Dense(30))",No,5,76.0 "# model2 = Sequential() # model2.add(Conv2D(filters = 64, kernel_size = (4,4), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model2.add(LeakyReLU(alpha=.001)) # model2.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model2.add(Dropout(0.5)) # model2.add(Flatten()) # model2.add(Dense(256, activation = 'linear')) # model2.add(LeakyReLU(alpha=.001)) # model2.add(Dropout(0.7)) # model2.add(Dense(30))",No,5,76.0 "# model3 = Sequential() # model3.add(Conv2D(filters = 128, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model3.add(LeakyReLU(alpha=.001)) # model3.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model3.add(Dropout(0.5)) # model3.add(Flatten()) # model3.add(Dense(256, activation = 'linear')) # model3.add(LeakyReLU(alpha=.001)) # model3.add(Dropout(0.5)) # model3.add(Dense(128, activation = 'linear')) # model3.add(LeakyReLU(alpha=.001)) # model3.add(Dropout(0.7)) # model3.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model4 = Sequential() # model4.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu', input_shape = (96, 96, 1))) # model4.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model4.add(Dropout(0.3)) # model4.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu')) # model4.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # 
model4.add(Dropout(0.5)) # model4.add(Flatten()) # model4.add(Dense(128, activation = 'relu')) # model4.add(Dropout(0.5)) # model4.add(Dense(30, activation = 'linear'))",No,5,76.0 "# np.random.seed(777) # model5 = Sequential() # model5.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model5.add(Conv2D(filters = 64, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model5.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model5.add(Dropout(0.25)) # model5.add(Flatten()) # model5.add(Dense(256, activation = 'relu')) # model5.add(Dropout(0.5)) # model5.add(Dense(128, activation = 'relu')) # model5.add(Dropout(0.7)) # model5.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model6 = Sequential() # model6.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu', input_shape = (96, 96, 1))) # model6.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model6.add(Dropout(0.3)) # model6.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'elu')) # model6.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model6.add(Dropout(0.5)) # model6.add(Flatten()) # model6.add(Dense(256, activation = 'elu')) # model6.add(Dropout(0.5)) # model6.add(Dense(128, activation = 'relu')) # model6.add(Dropout(0.7)) # model6.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model7 = Sequential() # model7.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model7.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model7.add(Dropout(0.5)) # model7.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model7.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model7.add(Dropout(0.5)) # model7.add(Flatten()) # model7.add(Dense(128, activation = 'relu')) # model7.add(Dropout(0.7)) # model7.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model8 = Sequential() # model8.add(Conv2D(filters = 64, kernel_size = (6,6), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model8.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model8.add(Dropout(0.3)) # model8.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model8.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model8.add(Dropout(0.5)) # model8.add(Flatten()) # model8.add(Dense(256, activation = 'relu')) # model8.add(Dropout(0.5)) # model8.add(Dense(128, activation = 'relu')) # model8.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model9 = Sequential() # model9.add(Conv2D(filters = 128, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model9.add(LeakyReLU(alpha=.001)) # model9.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model9.add(Dropout(0.5)) # model9.add(Flatten()) # model9.add(Dense(256, activation = 'linear')) # model9.add(LeakyReLU(alpha=.001)) # model9.add(Dropout(0.5)) # model9.add(Dense(128, activation = 'linear')) # model9.add(LeakyReLU(alpha=.001)) # model9.add(Dropout(0.7)) # model9.add(Dense(30))",No,5,76.0 "# model.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model2.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model3.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model4.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model5.compile(loss='mse', optimizer='adam', metrics = 
['accuracy'])",No,5,76.0 "# model6.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model7.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model8.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model9.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# time1 = time.time() # model.fit(x, y, epochs=100, batch_size=128, validation_split=0.2) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model2.fit(x, y, epochs=100, batch_size=100, validation_split=0.3) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model3.fit(x, y, epochs=100, batch_size=100, validation_split=0.2) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model4.fit(x, y, epochs=100, batch_size=100, validation_split=0.25) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model5.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model6.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model7.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model8.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model9.fit(x, y, validation_split=0.2, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 # !pip install kaggle,No,5,76.0 # !pip show kaggle,No,5,76.0 # !kaggle config path,No,5,76.0 # ! 
kaggle competitions submit -c facial-keypoints-detection -f predict.csv -m'submission,No,5,76.0 # !kaggle competitions submissions -c facial-keypoints-detection,No,5,76.0 "'''df=pd.get_dummies(df,columns=[ 'Month', 'District'],drop_first=True) df=pd.get_dummies(df,columns=[ 'Year'],drop_first=True) #df_test=pd.get_dummies(df_test,columns=['DayOfWeek','PdDistrict','Year','Month','Day','Hour','Minute'],drop_first=True) ''' ",No,5,76.0 """""""df_test=pd.get_dummies(df_test,columns=[ \'Month\', \'District\'],drop_first=True) df_test=pd.get_dummies(df_test,columns=[ \'Year\'],drop_first=True)""""""",No,5,76.0 "#df=df[['Hour', 'Day', 'Month', 'Year', 'Address', # 'District','X','radial60','Intersection']]",No,5,76.0 "#df=pd.get_dummies(df,columns=[ 'Hour'],drop_first=True) ",No,5,76.0 "#df_test=pd.get_dummies(df_test,columns=[ 'Hour'],drop_first=True)",No,5,76.0 #df.columns.nunique(),No,5,76.0 #df_test.columns.nunique(),No,5,76.0 "'''import lightgbm as lgb model5= lgb.LGBMClassifier(objective='multiclass') model5.fit(X_train,y_train) y_final=model5.predict_proba(X_test) print (log_loss(y_test,y_final));'''",No,5,76.0 "#print (log_loss(y_test,y_pred));",No,5,76.0 """""""temp = data[\'Category\'] le.fit_transform(temp) le.classes_ """"""",No,5,76.0 "# y_pred= pd.DataFrame(y_pred, index=Id_test,columns = le.classes_) ",No,5,76.0 "#from sklearn.linear_model import LogisticRegression #weight={Address:3,District:3,X:1,Day:2} #weight={LARCENY/THEFT:35} #classifier = LogisticRegression(penalty='l1',random_state = 0,class_weight='balanced',multi_class='multinomial', solver='saga',n_jobs=-1) #classifier = LogisticRegression(random_state=0, penalty='l1',multi_class='multinomial', solver='saga' ) #classifier.fit(X_train[0:50000],y_train[0:50000])",No,5,76.0 #y_pred=model.predict_proba(X_test),No,5,76.0 """""""from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(algorithm=\'auto\', leaf_size=60, metric=\'minkowski\', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights=\'uniform\') knn.fit(X_train[0:100000], y_train[0:100000]) """""" ",No,5,76.0 #y_pred=knn.predict_proba(X_test),No,5,76.0